From 5626a1c78904db3e9134a486567aaf0fa92dbc71 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Thu, 23 Mar 2023 12:04:18 +0800 Subject: [PATCH 01/25] cambricon support gpt2 inference --- libai/config/configs | 1 + libai/inference/generator/generation_utils.py | 10 +++++----- libai/layers/activation.py | 4 +++- libai/layers/embedding.py | 7 +++---- libai/models/gpt_model.py | 1 - libai/tokenizer/tokenization_base.py | 2 +- libai/utils/distributed.py | 4 ++-- libai/version.py | 2 ++ projects/MagicPrompt/configs/gpt2_inference.py | 6 +++--- projects/MagicPrompt/pipeline.py | 4 ++-- 10 files changed, 22 insertions(+), 19 deletions(-) create mode 120000 libai/config/configs create mode 100644 libai/version.py diff --git a/libai/config/configs b/libai/config/configs new file mode 120000 index 000000000..4528f6785 --- /dev/null +++ b/libai/config/configs @@ -0,0 +1 @@ +/home/zhangxiaoyu/libai/configs \ No newline at end of file diff --git a/libai/inference/generator/generation_utils.py b/libai/inference/generator/generation_utils.py index 73df306b6..14b8e3dee 100644 --- a/libai/inference/generator/generation_utils.py +++ b/libai/inference/generator/generation_utils.py @@ -202,7 +202,7 @@ def _expand_inputs_for_generation( ) expanded_return_idx = expanded_return_idx.to_global( sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), - placement=flow.placement("cuda", list(range(dist.get_world_size()))), + placement=flow.placement("mlu", list(range(dist.get_world_size()))), ) input_ids = input_ids.index_select(0, expanded_return_idx) @@ -493,7 +493,7 @@ def greedy_search( scores += (next_token_scores,) # argmax - next_tokens = flow.argmax(next_token_scores, dim=-1) + next_tokens = flow.argmax(next_token_scores.to("cpu"), dim=-1).to("mlu") next_tokens = next_tokens.to_global(placement=input_ids.placement) unfinished_sequences = unfinished_sequences.to_global( sbp=next_tokens.sbp, placement=next_tokens.placement @@ -518,7 +518,7 @@ def greedy_search( # if eos_token was found in one sentence, set sentence to finished if eos_token_id is not None: unfinished_sequences = flow.mul( - unfinished_sequences, (next_tokens != eos_token_id).long() + unfinished_sequences, (next_tokens.to(flow.int32) != eos_token_id).long() ) if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): @@ -589,12 +589,12 @@ def multinomial_sample( probs = nn.functional.softmax(next_token_scores, dim=-1) probs = probs.to_global( sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), - placement=flow.placement("cuda", list(range(dist.get_world_size()))), + placement=flow.placement("mlu", list(range(dist.get_world_size()))), ).to_local() next_tokens = flow.multinomial(probs, num_samples=1).squeeze(1) next_tokens = next_tokens.to_global( sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), - placement=flow.placement("cuda", list(range(dist.get_world_size()))), + placement=flow.placement("mlu", list(range(dist.get_world_size()))), ) unfinished_sequences = unfinished_sequences.to_global( sbp=next_tokens.sbp, placement=next_tokens.placement diff --git a/libai/layers/activation.py b/libai/layers/activation.py index 0aa6e7da8..bba8dd4a7 100644 --- a/libai/layers/activation.py +++ b/libai/layers/activation.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import math from enum import Enum from typing import Optional @@ -56,7 +57,8 @@ def forward(self, x: flow.Tensor) -> flow.Tensor: """When the approximate argument is 'tanh', Gelu is estimated with: 0.5 * x * (1.0 + flow.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * flow.pow(x, 3.0)))) """ - return flow.nn.functional.gelu(x, approximate="tanh") + # return flow.nn.functional.gelu(x, approximate="tanh") + return 0.5 * x * (1.0 + flow.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * flow.pow(x, 3.0)))) class QuickGELU(nn.Module): diff --git a/libai/layers/embedding.py b/libai/layers/embedding.py index 033ac6406..4dbb7b1f7 100644 --- a/libai/layers/embedding.py +++ b/libai/layers/embedding.py @@ -71,8 +71,8 @@ def __init__( sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), ) ) - if os.getenv("ONEFLOW_LINEAR_EMBEDDING_SKIP_INIT", "0") != "1": - self.init_method(self.weight) + # if os.getenv("ONEFLOW_LINEAR_EMBEDDING_SKIP_INIT", "0") != "1": + # self.init_method(self.weight) # FIXME(lxy): Fill padding_idx is not supported in nd_sbp right now. # self._fill_padding_idx_with_zero() @@ -148,7 +148,7 @@ def __init__( ) ) # Initialize the word embedding - self.init_method(self.weight) + # self.init_method(self.weight) # FIXME(Lxy): Fill padding_idx is not supported in nd_sbp right now. # self._fill_padding_idx_with_zero() @@ -163,7 +163,6 @@ def forward(self, input_ids): input_embeds = flow._C.gather(weight, input_ids, axis=0) # Set the embeds sbp from [S(0), P] --> [S(0), B] to get complete embedding results. input_embeds = input_embeds.to_global(sbp=dist.get_hidden_sbp()) - return input_embeds def _fill_padding_idx_with_zero(self) -> None: diff --git a/libai/models/gpt_model.py b/libai/models/gpt_model.py index 27f6bc8e9..cc65a99bc 100644 --- a/libai/models/gpt_model.py +++ b/libai/models/gpt_model.py @@ -203,7 +203,6 @@ def forward(self, input_ids): Returns: flow.Tensor: logits """ - input_ids = input_ids.to_global(placement=dist.get_layer_placement(0)) input_embeds = self.embeddings(input_ids, 0) diff --git a/libai/tokenizer/tokenization_base.py b/libai/tokenizer/tokenization_base.py index d5bd5a9ea..e2e439e04 100644 --- a/libai/tokenizer/tokenization_base.py +++ b/libai/tokenizer/tokenization_base.py @@ -783,7 +783,7 @@ def convert_to_tensors(self, token_ids, return_tensors=None, is_global=False, ** elif is_global: sbp = kwargs.get("sbp", dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])) placement = kwargs.get( - "placement", flow.placement("cuda", list(range(dist.get_world_size()))) + "placement", flow.placement("mlu", list(range(dist.get_world_size()))) ) return_token_ids = flow.tensor( token_ids, sbp=sbp, placement=placement, dtype=flow.long diff --git a/libai/utils/distributed.py b/libai/utils/distributed.py index e7914a0ad..4dd934c6f 100644 --- a/libai/utils/distributed.py +++ b/libai/utils/distributed.py @@ -71,7 +71,7 @@ def _init_distributed_env(self, cfg): self._world_size = num_gpus_per_node * num_nodes # Add set device type - self._device_type = try_get_key(cfg, "device_type", default="cuda") + self._device_type = try_get_key(cfg, "device_type", default="mlu") def _init_parallel_size(self, cfg): @@ -438,7 +438,7 @@ def convert_to_distributed_default_setting(t): def ttol(tensor, pure_local=False, ranks=None): """Global tensor to local tensor.""" if tensor.is_global: - placement = tensor.placement if not ranks else flow.placement("cuda", ranks) + placement = tensor.placement if not ranks else flow.placement("mlu", ranks) if pure_local: tensor = 
tensor.to_global(placement=placement).to_local() else: diff --git a/libai/version.py b/libai/version.py new file mode 100644 index 000000000..1a99cd82a --- /dev/null +++ b/libai/version.py @@ -0,0 +1,2 @@ +__version__ = '0.2.0' +git_version = '50a973dc5de635b8613ad7666c073c763e238850' diff --git a/projects/MagicPrompt/configs/gpt2_inference.py b/projects/MagicPrompt/configs/gpt2_inference.py index a78b51f78..daf139013 100644 --- a/projects/MagicPrompt/configs/gpt2_inference.py +++ b/projects/MagicPrompt/configs/gpt2_inference.py @@ -56,14 +56,14 @@ sep_token_id=None, decoder_start_token_id=None, # train - pretrained_model_path="/data/home/magicprompt", + pretrained_model_path="/home/zhangxiaoyu/libai/oneflow-model", ) model = LazyCall(GPTModel)(cfg=cfg) pretrain_model = LazyCall(GPTForPreTraining)(cfg=cfg) tokenization.tokenizer = LazyCall(GPT2Tokenizer)( - vocab_file="/data/home/magicprompt/vocab.json", - merges_file="/data/home/magicprompt/merges.txt", + vocab_file="/home/zhangxiaoyu/oneflow-model/vocab.json", + merges_file="/home/zhangxiaoyu/oneflow-model/merges.txt", add_bos_token=True, ) diff --git a/projects/MagicPrompt/pipeline.py b/projects/MagicPrompt/pipeline.py index 1da8e5b4e..0a183a08d 100644 --- a/projects/MagicPrompt/pipeline.py +++ b/projects/MagicPrompt/pipeline.py @@ -76,7 +76,7 @@ def preprocess(self, inputs, **kwargs) -> dict: return inputs def forward(self, inputs, **kwargs) -> dict: - outputs = self.model.generate(inputs["input_ids"], do_sample=True, max_length=50, **kwargs) + outputs = self.model.generate(inputs["input_ids"], do_sample=False, max_length=50, **kwargs) return {"return_ids": outputs} def postprocess(self, model_output_dict, **kwargs) -> dict: @@ -96,7 +96,7 @@ def postprocess(self, model_output_dict, **kwargs) -> dict: pipeline_parallel=1, # pipeline_stage_id=[0] * 6 + [1] * 6, # pipeline_num_layers=12, - model_path="/path/to/oneflow-model", + model_path="/home/zhangxiaoyu/oneflow-model/model", mode="libai", ) From 73c6732953850b0d2b8e7d66543026d8441ebaae Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Date: Thu, 23 Mar 2023 12:11:43 +0800 Subject: [PATCH 02/25] Delete configs --- libai/config/configs | 1 - 1 file changed, 1 deletion(-) delete mode 120000 libai/config/configs diff --git a/libai/config/configs b/libai/config/configs deleted file mode 120000 index 4528f6785..000000000 --- a/libai/config/configs +++ /dev/null @@ -1 +0,0 @@ -/home/zhangxiaoyu/libai/configs \ No newline at end of file From 48062f5e4733dbd5fa0fab3f928410be86ef0118 Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Date: Thu, 23 Mar 2023 12:11:53 +0800 Subject: [PATCH 03/25] Delete version.py --- libai/version.py | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 libai/version.py diff --git a/libai/version.py b/libai/version.py deleted file mode 100644 index 1a99cd82a..000000000 --- a/libai/version.py +++ /dev/null @@ -1,2 +0,0 @@ -__version__ = '0.2.0' -git_version = '50a973dc5de635b8613ad7666c073c763e238850' From 9e8001893d6ec8ab470b36cc36b15555c84c4063 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Tue, 28 Mar 2023 10:11:52 +0800 Subject: [PATCH 04/25] refine train script --- configs/common/data/gpt_dataset.py | 72 +- libai/config/configs | 1 + libai/version.py | 2 + projects/MagicPrompt/configs/gpt2_training.py | 12 +- .../oneflow_magicprompt/config.yaml | 76 + .../events.out.tfevents.1679969191.oneflow-32 | Bin 0 -> 40 bytes .../events.out.tfevents.1679969426.oneflow-32 | Bin 0 -> 
40 bytes .../events.out.tfevents.1679969473.oneflow-32 | Bin 0 -> 40 bytes .../MagicPrompt/oneflow_magicprompt/log.txt | 1587 +++++++++++++++++ .../oneflow_magicprompt/metrics.json | 0 tools/train_net.py | 3 +- 11 files changed, 1710 insertions(+), 43 deletions(-) create mode 120000 libai/config/configs create mode 100644 libai/version.py create mode 100644 projects/MagicPrompt/oneflow_magicprompt/config.yaml create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969191.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969426.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969473.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/log.txt create mode 100644 projects/MagicPrompt/oneflow_magicprompt/metrics.json diff --git a/configs/common/data/gpt_dataset.py b/configs/common/data/gpt_dataset.py index 781eef1b0..0bcccec42 100644 --- a/configs/common/data/gpt_dataset.py +++ b/configs/common/data/gpt_dataset.py @@ -20,40 +20,40 @@ dataloader = OmegaConf.create() -dataloader.train = LazyCall(build_nlp_train_val_test_loader)( - dataset=[ - LazyCall(GPT2Dataset)( - name="gpt-2", - data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence", - indexed_dataset=LazyCall(get_indexed_dataset)( - data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence", - data_impl="mmap", - skip_warmup=False, - ), - max_seq_length=1024, - seed=1234, - ), - ], - train_val_test_num_samples=None, # a hint for deferred assignment - splits=[[949.0, 50.0, 1.0]], - weights=[1.0], - num_workers=4, -) +# dataloader.train = LazyCall(build_nlp_train_val_test_loader)( +# dataset=[ +# LazyCall(GPT2Dataset)( +# name="gpt-2", +# data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence", +# indexed_dataset=LazyCall(get_indexed_dataset)( +# data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence", +# data_impl="mmap", +# skip_warmup=False, +# ), +# max_seq_length=1024, +# seed=1234, +# ), +# ], +# train_val_test_num_samples=None, # a hint for deferred assignment +# splits=[[949.0, 50.0, 1.0]], +# weights=[1.0], +# num_workers=4, +# ) -dataloader.test = [ - LazyCall(build_nlp_test_loader)( - dataset=LazyCall(GPT2Dataset)( - name="gpt-2", - data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence", - indexed_dataset=LazyCall(get_indexed_dataset)( - data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence", - data_impl="mmap", - skip_warmup=False, - ), - max_seq_length=1024, - max_num_samples=10, - seed=1234, - ), - test_batch_size=4, - ) -] +# dataloader.test = [ +# LazyCall(build_nlp_test_loader)( +# dataset=LazyCall(GPT2Dataset)( +# name="gpt-2", +# data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence", +# indexed_dataset=LazyCall(get_indexed_dataset)( +# data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence", +# data_impl="mmap", +# skip_warmup=False, +# ), +# max_seq_length=1024, +# max_num_samples=10, +# seed=1234, +# ), +# test_batch_size=4, +# ) +# ] diff --git a/libai/config/configs b/libai/config/configs new file mode 120000 index 000000000..4528f6785 --- /dev/null +++ b/libai/config/configs @@ -0,0 +1 @@ +/home/zhangxiaoyu/libai/configs \ No newline at end of file diff --git a/libai/version.py b/libai/version.py new file mode 100644 index 000000000..36ee184fb --- /dev/null +++ b/libai/version.py @@ -0,0 +1,2 @@ +__version__ = '0.2.0' +git_version = 
'89f63fc3be24e4d921b08656d051e143fcb70ef8' diff --git a/projects/MagicPrompt/configs/gpt2_training.py b/projects/MagicPrompt/configs/gpt2_training.py index 69406f1a6..d648d0f52 100644 --- a/projects/MagicPrompt/configs/gpt2_training.py +++ b/projects/MagicPrompt/configs/gpt2_training.py @@ -10,9 +10,9 @@ from configs.common.models.graph import graph -vocab_file = "/data/home/magicprompt/vocab.json" -merge_files = "/data/home/magicprompt/merges.txt" -train_data_prefix = "/data/home/magicprompt/train/en_train_mmap_text_sentence" +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" tokenization.tokenizer.vocab_file = vocab_file tokenization.tokenizer.merges_file = merge_files @@ -33,9 +33,9 @@ model.cfg.vocab_size = 50257 model.cfg.layernorm_epsilon = 1e-5 model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = True -model.cfg.bias_dropout_fusion = True -model.cfg.scale_mask_softmax_fusion = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False model.cfg.apply_query_key_layer_scaling = True model.cfg.apply_residual_post_layernorm = False model.cfg.amp_enabled = True diff --git a/projects/MagicPrompt/oneflow_magicprompt/config.yaml b/projects/MagicPrompt/oneflow_magicprompt/config.yaml new file mode 100644 index 000000000..6131f2ddc --- /dev/null +++ b/projects/MagicPrompt/oneflow_magicprompt/config.yaml @@ -0,0 +1,76 @@ +dataloader: + train: + _target_: libai.data.build_nlp_train_val_test_loader + dataset: + - _target_: libai.data.datasets.GPT2Dataset + data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence + indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false} + max_seq_length: 1024 + name: gpt-2 + seed: 1234 + num_workers: 4 + splits: + - [949.0, 50.0, 1.0] + train_val_test_num_samples: null + weights: [1.0] +ds: + _target_: libai.data.datasets.GPT2Dataset + data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence + indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false} + max_seq_length: 1024 + name: gpt-2 + seed: 1234 +graph: + auto_parallel: {enable_auto_parallel_ignore_user_sbp_config: false, enabled: false, sbp_collector: false, trunk_algo: true} + debug: -1 + enabled: true + eval_graph: {_target_: libai.models.utils.GraphBase, is_train: false} + train_graph: {_target_: libai.models.utils.GraphBase, is_train: true} +model: + _target_: projects.MagicPrompt.gpt2.GPTForPreTraining + cfg: {amp_enabled: true, apply_query_key_layer_scaling: true, apply_residual_post_layernorm: false, attention_dropout_prob: 0.1, bias_dropout_fusion: false, bias_gelu_fusion: false, bos_token_id: 50256, chunk_size_feed_forward: 0, decoder_start_token_id: null, diversity_penalty: 0.0, do_sample: false, early_stopping: false, embedding_dropout_prob: 0.1, encoder_no_repeat_ngram_size: 0, eos_token_id: 50256, exponential_decay_length_penalty: null, ffn_hidden_size: 3072, forced_bos_token_id: null, forced_eos_token_id: null, hidden_layers: 12, hidden_size: 768, initializer_range: 0.02, is_encoder_decoder: false, layernorm_epsilon: 1.0e-05, length_penalty: 1.0, max_length: 
20, max_seq_length: 1024, min_length: 0, no_repeat_ngram_size: 0, num_attention_heads: 12, num_beam_groups: 1, num_beams: 1, num_return_sequences: 1, output_dropout_prob: 0.1, output_scores: false, pad_token_id: 0, pretrained_model_path: null, remove_invalid_values: false, repetition_penalty: 1.0, scale_mask_softmax_fusion: false, sep_token_id: null, temperature: 1.0, top_k: 50, top_p: 1.0, typical_p: 1.0, use_cache: true, use_scaled_init_for_output_weights: true, vocab_size: 50257}
+optim:
+  _target_: oneflow.nn.optimizer.adamw.AdamW
+  betas: [0.9, 0.999]
+  do_bias_correction: true
+  eps: 1.0e-08
+  lr: 5.0e-05
+  params: {_target_: libai.optim.get_default_optimizer_params, clip_grad_max_norm: 1.0, clip_grad_norm_type: 2.0, weight_decay_bias: 0.0, weight_decay_norm: 0.0}
+  weight_decay: 0.01
+tokenization:
+  append_eod: false
+  make_vocab_size_divisible_by: 128
+  tokenizer: {_target_: libai.tokenizer.GPT2Tokenizer, do_chinese_wwm: true, do_lower_case: true, merges_file: /home/zhangxiaoyu/magicprompt/merges.txt, vocab_file: /home/zhangxiaoyu/magicprompt/vocab.json}
+train:
+  activation_checkpoint: {enabled: false}
+  amp: {enabled: true}
+  checkpointer: {max_to_keep: 20, period: 8000}
+  consumed_train_samples: 0
+  consumed_valid_samples: 0
+  dist: {data_parallel_size: 1, num_gpus_per_node: 1, num_nodes: 1, pipeline_num_layers: 10000, pipeline_parallel_size: 1, tensor_parallel_size: 1}
+  evaluation:
+    enabled: true
+    eval_iter: 250
+    eval_period: 4000
+    evaluator: {_target_: libai.evaluation.PPLEvaluator}
+  global_batch_size: 4
+  input_placement_device: cpu
+  load_weight: ''
+  log_period: 50
+  nccl_fusion_max_ops: 24
+  nccl_fusion_threshold_mb: 16
+  num_accumulation_steps: 1
+  output_dir: projects/MagicPrompt/oneflow_magicprompt
+  rdma_enabled: false
+  resume: false
+  samples: 40000
+  scheduler: {_target_: libai.scheduler.WarmupExponentialLR, gamma: 1.0, warmup_factor: 0.0, warmup_iter: 0.0, warmup_method: linear}
+  seed: 1234
+  start_iter: 0
+  test_micro_batch_size: 4
+  train_epoch: 33
+  train_iter: 10000
+  train_micro_batch_size: 4
+  train_samples: null
+  warmup_ratio: 0
+  zero_optimization: {enabled: false, stage: 1}
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969191.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969191.oneflow-32
new file mode 100644
index 0000000000000000000000000000000000000000..f799b14cf25f492648c539794bac8f5bd78beb43
GIT binary patch
[40-byte TensorBoard event file; binary payload omitted]
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969426.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969426.oneflow-32
new file mode 100644
GIT binary patch
[40-byte TensorBoard event file; binary payload omitted]
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969473.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969473.oneflow-32
new file mode 100644
index 0000000000000000000000000000000000000000..45678997c849a0bef0930a79e75c3162f93ea511
GIT binary patch
[40-byte TensorBoard event file; binary payload omitted]
diff --git a/projects/MagicPrompt/oneflow_magicprompt/log.txt b/projects/MagicPrompt/oneflow_magicprompt/log.txt
new file mode 100644
--- /dev/null
+++ b/projects/MagicPrompt/oneflow_magicprompt/log.txt
@@ -0,0 +1,1587 @@
+[03/28 09:15:12] lb.engine.default INFO: > compiling dataset index builder ...
+[03/28 09:15:12] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.053 seconds
+[03/28 09:15:12] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.053 seconds
+[03/28 09:37:48] libai INFO: Rank of current process: 0. 
World size: 1 +[03/28 09:37:48] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/28 09:37:48] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = True +model.cfg.bias_dropout_fusion = True +model.cfg.scale_mask_softmax_fusion = True +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = True + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=50, + amp=dict(enabled=True), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=1, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers=2 * model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/28 09:37:48] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml +[03/28 09:37:48] lb.engine.default INFO: > compiling dataset index builder ... +[03/28 09:37:48] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.052 seconds +[03/28 09:37:48] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.052 seconds +[03/28 09:38:23] libai INFO: Rank of current process: 0. 
World size: 1
+[03/28 09:38:23] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/28 09:38:23] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
[... gpt2_training.py contents repeated verbatim; identical to the dump at 09:37:48 above ...]
+[03/28 09:38:23] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[03/28 09:38:23] lb.engine.default INFO: > compiling dataset index builder ...
+[03/28 09:38:23] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.056 seconds
+[03/28 09:38:23] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.056 seconds
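
Aside: the activation change in patch 01 replaced the fused call flow.nn.functional.gelu(x, approximate="tanh") in libai/layers/activation.py with the explicit tanh formula, presumably because that fused kernel is not registered for the MLU backend. Below is a quick check of the substituted formula against the exact erf-based GELU, as an editorial sketch using numpy as a stand-in for oneflow (not part of the patch):

    import math
    import numpy as np

    def gelu_exact(x):
        # Exact GELU: x * Phi(x), where Phi is the standard normal CDF.
        return 0.5 * x * (1.0 + np.vectorize(math.erf)(x / math.sqrt(2.0)))

    def gelu_tanh(x):
        # The tanh approximation substituted into activation.py above.
        inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * np.power(x, 3.0))
        return 0.5 * x * (1.0 + np.tanh(inner))

    x = np.linspace(-6.0, 6.0, 2001)
    print(np.abs(gelu_exact(x) - gelu_tanh(x)).max())  # small (~1e-3 or below)

The two curves agree closely over the useful input range, so swapping in the explicit formula changes the numerics only marginally.
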
+[03/28 09:44:17] libai INFO: Rank of current process: 0. World size: 1
+[03/28 09:44:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/28 09:44:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
[... gpt2_training.py contents repeated verbatim; identical to the dump at 09:37:48 above ...]
+[03/28 09:44:17] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[03/28 09:44:17] lb.engine.default INFO: > compiling dataset index builder ...
+[03/28 09:44:17] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.050 seconds
+[03/28 09:44:17] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.051 seconds
+[03/28 09:44:19] lb.engine.default INFO: Prepare training, validating, testing set
+[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
+[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... 
+[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007179 seconds +[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ... +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > last epoch number of samples (3905) is larger than 80% of number of samples per epoch (4511), setting separate_last_epoch to False +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ... +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.035229 +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ... +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.002511 +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 40607) and [40607, 40607) ... +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.001179 +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ... +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > only one epoch required, setting separate_last_epoch to False +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ... +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.003403 +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ... 
+[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.000375 +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 4511) and [4511, 4511) ... +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000237 +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ... +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > only one epoch required, setting separate_last_epoch to False +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ... +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.003150 +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ... +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.000333 +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 4511) and [4511, 4511) ... +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000232 +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/28 09:44:19] lb.engine.default INFO: Prepare testing set +[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: Dataset does not exist: /workspace/data/libai_dataset/loss_compara_content_sentence +[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: Path should be a basename that both .idx and .bin can be appended to get full filenames. +[03/28 10:06:27] libai INFO: Rank of current process: 0. 
World size: 1
+[03/28 10:06:27] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/28 10:06:27] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
[... gpt2_training.py contents repeated verbatim; identical to the dump at 09:37:48 above ...]
+[03/28 10:06:27] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[03/28 10:06:27] lb.engine.default INFO: > compiling dataset index builder ...
+[03/28 10:06:27] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.050 seconds
+[03/28 10:06:27] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.051 seconds
+[03/28 10:06:29] lb.engine.default INFO: Prepare training, validating, testing set
+[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
+[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... 
+[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007199 seconds +[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/28 10:06:29] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, 
train.warmup_iter=0 +[03/28 10:06:29] libai INFO: > Start building model... +[03/28 10:06:30] lb.engine.default INFO: Model: +GPTForPreTraining( + (GPT_model): GPTModel( + (embeddings): GPTEmbedding( + (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) + (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) + (dropout): Dropout(p=0.1, inplace=False) + ) + (transformer): Transformer( + (layers): ModuleList( + (0): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (1): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (2): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (3): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): 
Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (4): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (5): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (6): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (7): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (8): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, 
out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (9): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (10): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (11): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + ) + (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (lm_head): LMLogits() + ) + (loss_func): GPTLoss( + (lm_loss): ParallelCrossEntropyLoss() + ) +) +[03/28 10:06:30] libai INFO: >>> done with building model. Building time: 0.277 seconds +[03/28 10:06:30] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR +[03/28 10:06:31] lb.engine.trainer INFO: Starting training from iteration 0 +[03/28 10:06:31] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... 
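
Aside: the graph compile below fails because the fused op fused_tril_scale_softmax_mask_scale has no kernel registered for the MLU device (see the traceback that follows), which is why the subsequent run sets model.cfg.scale_mask_softmax_fusion = False and falls back to unfused ops. Roughly, the fused kernel scales the attention logits, applies the causal lower-triangular mask, and takes a softmax in a single pass. A numpy sketch of those assumed semantics, reconstructed from the op name and the usual causal-attention pattern (the trailing dropout-style mask-and-scale step is omitted, and this is not the authoritative kernel definition):

    import numpy as np

    def tril_scale_softmax(scores, scale, fill_value=-10000.0):
        # scores: [batch, heads, seq, seq] raw attention logits.
        seq = scores.shape[-1]
        causal = np.tril(np.ones((seq, seq), dtype=bool))     # keep positions j <= i
        masked = np.where(causal, scores * scale, fill_value)
        masked = masked - masked.max(axis=-1, keepdims=True)  # numerically stable softmax
        probs = np.exp(masked)
        return probs / probs.sum(axis=-1, keepdims=True)

    # usage: attention probabilities for one 12-head, length-8 batch
    # probs = tril_scale_softmax(np.random.randn(1, 12, 8, 8), scale=0.125)

Disabling the fusion trades kernel-launch efficiency for portability: the unfused path is assembled from primitive ops that the MLU backend already implements.
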
+[03/28 10:06:36] lb.engine.trainer ERROR: Exception during training: +Traceback (most recent call last): + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train + self.run_step() + File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step + self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step + loss_dict = self.graph(**data) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__ + self._compile(*args, **kwargs) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile + self.finish_compile_and_init_runtime() + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1165, in finish_compile_and_init_runtime + self._c_nn_graph.compile_plan_for_runtime() +oneflow._oneflow_internal.exception.RuntimeError: Error: There is no kernel registered for Current OperatorConf. The Info of OperatorConf are + op_name: model.GPT_model.transformer.layers.0.self_attention-fused_tril_scale_softmax_mask_scale-29 + op_type_name: fused_tril_scale_softmax_mask_scale + DeviceType_Name: kMLU + DataType_Name of x_0: kFloat + DataType_Name of mask_0: kBool + DataType_Name of softmax_y_0: kFloat + DataType_Name of y_0: kFloat +Error message from /home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp:120 + op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()): infer blob descs if failed, op name File "/home/zhangxiaoyu/libai/projects/MagicPrompt/layers/attention_layer.py", line 241, in forward, source < attention_weights = flow._C.fused_scale_tril_softmax_mask_scale( >; File "/home/zhangxiaoyu/libai/projects/MagicPrompt/layers/transformer_layer.py", line 180, in forward, source < attention_output = self.self_attention( >; ... 10 more + + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in InferBlobDescs + op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/operator.cpp", line 351, in InferBlobDescsIf + InferInternalBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, job_desc) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/user_op.cpp", line 711, in InferInternalBlobDescs + user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult( op_conf().user_conf().op_type_name(), UserOpKernelRegContext(this, GetBlobDesc4BnInOp, parallel_ctx)) +Error Type: oneflow.ErrorProto.op_kernel_not_found_error + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in operator() + +Error Type: oneflow.ErrorProto.runtime_error + +[03/28 10:06:36] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks) +[03/28 10:10:22] libai INFO: Rank of current process: 0. 
World size: 1 +[03/28 10:10:22] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/28 10:10:22] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = True +model.cfg.bias_dropout_fusion = True +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = True + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=50, + amp=dict(enabled=True), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=1, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers=2 * model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/28 10:10:22] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml +[03/28 10:10:22] lb.engine.default INFO: > compiling dataset index builder ... +[03/28 10:10:22] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.044 seconds +[03/28 10:10:22] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.045 seconds +[03/28 10:10:24] lb.engine.default INFO: Prepare training, validating, testing set +[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... 
+[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007224 seconds +[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/28 10:10:24] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, 
train.warmup_iter=0 +[03/28 10:10:24] libai INFO: > Start building model... +[03/28 10:10:24] lb.engine.default INFO: Model: +GPTForPreTraining( + (GPT_model): GPTModel( + (embeddings): GPTEmbedding( + (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) + (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) + (dropout): Dropout(p=0.1, inplace=False) + ) + (transformer): Transformer( + (layers): ModuleList( + (0): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (1): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (2): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (3): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): 
Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (4): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (5): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (6): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (7): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (8): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, 
out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (9): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (10): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + (11): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + ) + ) + ) + (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (lm_head): LMLogits() + ) + (loss_func): GPTLoss( + (lm_loss): ParallelCrossEntropyLoss() + ) +) +[03/28 10:10:24] libai INFO: >>> done with building model. Building time: 0.393 seconds +[03/28 10:10:24] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR +[03/28 10:10:26] lb.engine.trainer INFO: Starting training from iteration 0 +[03/28 10:10:26] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... 
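This second run (config above) turns scale_mask_softmax_fusion off, and compilation indeed gets past the attention softmax; the next traceback instead stops at fused_bias_add_mask_scale, reached from flow._C.fused_bias_add_dropout in attention_layer.py because bias_dropout_fusion is still True. Judging from the op's inputs (a_0, b_0, mask_0, _add_to_output_0), it fuses a bias add, a dropout, and a residual add; a rough unfused equivalent is sketched below (the function name and signature are assumptions for illustration, not the repo's API).

import oneflow as flow

def bias_add_dropout_add(x, bias, residual, p=0.1):
    # a_0 + b_0, then dropout (the "mask_scale"), then + _add_to_output_0.
    return flow.nn.functional.dropout(x + bias, p=p) + residual

Setting bias_dropout_fusion = False, as the third config in this log does, makes the model take this unfused route.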
+[03/28 10:10:31] lb.engine.trainer ERROR: Exception during training: +Traceback (most recent call last): + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train + self.run_step() + File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step + self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step + loss_dict = self.graph(**data) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__ + self._compile(*args, **kwargs) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile + self.finish_compile_and_init_runtime() + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1165, in finish_compile_and_init_runtime + self._c_nn_graph.compile_plan_for_runtime() +oneflow._oneflow_internal.exception.RuntimeError: Error: There is no kernel registered for Current OperatorConf. The Info of OperatorConf are + op_name: model.GPT_model.transformer.layers.0.self_attention-fused_bias_add_mask_scale-36 + op_type_name: fused_bias_add_mask_scale + DeviceType_Name: kMLU + DataType_Name of a_0: kFloat + DataType_Name of b_0: kFloat + DataType_Name of mask_0: kBool + DataType_Name of _add_to_output_0: kFloat + DataType_Name of out_0: kFloat +Error message from /home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp:120 + op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()): infer blob descs if failed, op name File "/home/zhangxiaoyu/libai/projects/MagicPrompt/layers/attention_layer.py", line 258, in forward, source < output = flow._C.fused_bias_add_dropout( >; File "/home/zhangxiaoyu/libai/projects/MagicPrompt/layers/transformer_layer.py", line 180, in forward, source < attention_output = self.self_attention( >; ... 10 more + + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in InferBlobDescs + op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/operator.cpp", line 351, in InferBlobDescsIf + InferInternalBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, job_desc) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/user_op.cpp", line 711, in InferInternalBlobDescs + user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult( op_conf().user_conf().op_type_name(), UserOpKernelRegContext(this, GetBlobDesc4BnInOp, parallel_ctx)) +Error Type: oneflow.ErrorProto.op_kernel_not_found_error + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in operator() + +Error Type: oneflow.ErrorProto.runtime_error + +[03/28 10:10:31] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks) +[03/28 10:11:09] libai INFO: Rank of current process: 0. 
World size: 1 +[03/28 10:11:09] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/28 10:11:09] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = True + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=50, + amp=dict(enabled=True), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=1, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers=2 * model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/28 10:11:09] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml +[03/28 10:11:09] lb.engine.default INFO: > compiling dataset index builder ... +[03/28 10:11:09] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.081 seconds +[03/28 10:11:09] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.082 seconds +[03/28 10:11:11] lb.engine.default INFO: Prepare training, validating, testing set +[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... 
+[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007395 seconds +[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.005 seconds +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/28 10:11:11] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, 
train.warmup_iter=0 +[03/28 10:11:11] libai INFO: > Start building model... +[03/28 10:11:11] lb.engine.default INFO: Model: +GPTForPreTraining( + (GPT_model): GPTModel( + (embeddings): GPTEmbedding( + (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) + (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) + (dropout): Dropout(p=0.1, inplace=False) + ) + (transformer): Transformer( + (layers): ModuleList( + (0): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (1): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (2): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (3): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): 
Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (4): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (5): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (6): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (7): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + 
(dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (8): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (9): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (10): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (11): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, 
elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (lm_head): LMLogits() + ) + (loss_func): GPTLoss( + (lm_loss): ParallelCrossEntropyLoss() + ) +) +[03/28 10:11:11] libai INFO: >>> done with building model. Building time: 0.231 seconds +[03/28 10:11:11] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR +[03/28 10:11:13] lb.engine.trainer INFO: Starting training from iteration 0 +[03/28 10:11:13] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... +[03/28 10:11:19] lb.engine.trainer ERROR: Exception during training: +Traceback (most recent call last): + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train + self.run_step() + File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step + self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step + loss_dict = self.graph(**data) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__ + self._compile(*args, **kwargs) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile + self.finish_compile_and_init_runtime() + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1165, in finish_compile_and_init_runtime + self._c_nn_graph.compile_plan_for_runtime() +oneflow._oneflow_internal.exception.RuntimeError: Error: Cannot find the kernel matching Current OperatorConf. The Info of OperatorConf are + op_name: model.GPT_model.transformer.layers.11.mlp.activation_func-tanh_grad-601 + op_type_name: tanh_grad + DeviceType_Name: kMLU + DataType_Name of dy_0: kFloat + DataType_Name of x_0: kFloat + DataType_Name of dx_0: kFloat +Error message from /home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp:120 + op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()): infer blob descs if failed, op name File "/home/zhangxiaoyu/libai/libai/models/utils/graph_base.py", line 107, in build, source < losses.backward() >; File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step, source < loss_dict = self.graph(**data) >; ... 
5 more + + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in InferBlobDescs + op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/operator.cpp", line 351, in InferBlobDescsIf + InferInternalBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, job_desc) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/user_op.cpp", line 711, in InferInternalBlobDescs + user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult( op_conf().user_conf().op_type_name(), UserOpKernelRegContext(this, GetBlobDesc4BnInOp, parallel_ctx)) +Error Type: oneflow.ErrorProto.op_kernel_not_found_error + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in operator() + +Error Type: oneflow.ErrorProto.runtime_error + +[03/28 10:11:19] lb.engine.hooks INFO: Total training time: 0:00:06 (0:00:00 on hooks) diff --git a/projects/MagicPrompt/oneflow_magicprompt/metrics.json b/projects/MagicPrompt/oneflow_magicprompt/metrics.json new file mode 100644 index 000000000..e69de29bb diff --git a/tools/train_net.py b/tools/train_net.py index 458849c75..1d6080631 100644 --- a/tools/train_net.py +++ b/tools/train_net.py @@ -36,7 +36,8 @@ def main(args): seed_for_rank = cfg.train.seed + flow.env.get_rank() flow.manual_seed(seed_for_rank) - flow.cuda.manual_seed(seed_for_rank) + # flow.cuda.manual_seed(seed_for_rank) + flow._oneflow_internal.default_generator("mlu").manual_seed(seed_for_rank) np.random.seed(seed_for_rank) random.seed(seed_for_rank) From ec311a739120c1f93edf0dcc86627f61e84bcb02 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Tue, 28 Mar 2023 10:12:20 +0800 Subject: [PATCH 05/25] refine train script --- .../oneflow_magicprompt/config.yaml | 76 - .../events.out.tfevents.1679969191.oneflow-32 | Bin 40 -> 0 bytes .../events.out.tfevents.1679969426.oneflow-32 | Bin 40 -> 0 bytes .../events.out.tfevents.1679969473.oneflow-32 | Bin 40 -> 0 bytes .../MagicPrompt/oneflow_magicprompt/log.txt | 1587 ----------------- .../oneflow_magicprompt/metrics.json | 0 6 files changed, 1663 deletions(-) delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/config.yaml delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969191.oneflow-32 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969426.oneflow-32 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969473.oneflow-32 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/log.txt delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/metrics.json diff --git a/projects/MagicPrompt/oneflow_magicprompt/config.yaml b/projects/MagicPrompt/oneflow_magicprompt/config.yaml deleted file mode 100644 index 6131f2ddc..000000000 --- a/projects/MagicPrompt/oneflow_magicprompt/config.yaml +++ /dev/null @@ -1,76 +0,0 @@ -dataloader: - train: - _target_: libai.data.build_nlp_train_val_test_loader - dataset: - - _target_: libai.data.datasets.GPT2Dataset - data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence - indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false} - max_seq_length: 1024 - name: gpt-2 - seed: 1234 - num_workers: 4 - splits: - - [949.0, 50.0, 1.0] - train_val_test_num_samples: null - weights: [1.0] -ds: - _target_: libai.data.datasets.GPT2Dataset 
- data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence - indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false} - max_seq_length: 1024 - name: gpt-2 - seed: 1234 -graph: - auto_parallel: {enable_auto_parallel_ignore_user_sbp_config: false, enabled: false, sbp_collector: false, trunk_algo: true} - debug: -1 - enabled: true - eval_graph: {_target_: libai.models.utils.GraphBase, is_train: false} - train_graph: {_target_: libai.models.utils.GraphBase, is_train: true} -model: - _target_: projects.MagicPrompt.gpt2.GPTForPreTraining - cfg: {amp_enabled: true, apply_query_key_layer_scaling: true, apply_residual_post_layernorm: false, attention_dropout_prob: 0.1, bias_dropout_fusion: false, bias_gelu_fusion: false, bos_token_id: 50256, chunk_size_feed_forward: 0, decoder_start_token_id: null, diversity_penalty: 0.0, do_sample: false, early_stopping: false, embedding_dropout_prob: 0.1, encoder_no_repeat_ngram_size: 0, eos_token_id: 50256, exponential_decay_length_penalty: null, ffn_hidden_size: 3072, forced_bos_token_id: null, forced_eos_token_id: null, hidden_layers: 12, hidden_size: 768, initializer_range: 0.02, is_encoder_decoder: false, layernorm_epsilon: 1.0e-05, length_penalty: 1.0, max_length: 20, max_seq_length: 1024, min_length: 0, no_repeat_ngram_size: 0, num_attention_heads: 12, num_beam_groups: 1, num_beams: 1, num_return_sequences: 1, output_dropout_prob: 0.1, output_scores: false, pad_token_id: 0, pretrained_model_path: null, remove_invalid_values: false, repetition_penalty: 1.0, scale_mask_softmax_fusion: false, sep_token_id: null, temperature: 1.0, top_k: 50, top_p: 1.0, typical_p: 1.0, use_cache: true, use_scaled_init_for_output_weights: true, vocab_size: 50257} -optim: - _target_: oneflow.nn.optimizer.adamw.AdamW - betas: [0.9, 0.999] - do_bias_correction: true - eps: 1.0e-08 - lr: 5.0e-05 - params: {_target_: libai.optim.get_default_optimizer_params, clip_grad_max_norm: 1.0, clip_grad_norm_type: 2.0, weight_decay_bias: 0.0, weight_decay_norm: 0.0} - weight_decay: 0.01 -tokenization: - append_eod: false - make_vocab_size_divisible_by: 128 - tokenizer: {_target_: libai.tokenizer.GPT2Tokenizer, do_chinese_wwm: true, do_lower_case: true, merges_file: /home/zhangxiaoyu/magicprompt/merges.txt, vocab_file: /home/zhangxiaoyu/magicprompt/vocab.json} -train: - activation_checkpoint: {enabled: false} - amp: {enabled: true} - checkpointer: {max_to_keep: 20, period: 8000} - consumed_train_samples: 0 - consumed_valid_samples: 0 - dist: {data_parallel_size: 1, num_gpus_per_node: 1, num_nodes: 1, pipeline_num_layers: 10000, pipeline_parallel_size: 1, tensor_parallel_size: 1} - evaluation: - enabled: true - eval_iter: 250 - eval_period: 4000 - evaluator: {_target_: libai.evaluation.PPLEvaluator} - global_batch_size: 4 - input_placement_device: cpu - load_weight: '' - log_period: 50 - nccl_fusion_max_ops: 24 - nccl_fusion_threshold_mb: 16 - num_accumulation_steps: 1 - output_dir: projects/MagicPrompt/oneflow_magicprompt - rdma_enabled: false - resume: false - samples: 40000 - scheduler: {_target_: libai.scheduler.WarmupExponentialLR, gamma: 1.0, warmup_factor: 0.0, warmup_iter: 0.0, warmup_method: linear} - seed: 1234 - start_iter: 0 - test_micro_batch_size: 4 - train_epoch: 33 - train_iter: 10000 - train_micro_batch_size: 4 - train_samples: null - warmup_ratio: 0 - zero_optimization: {enabled: false, stage: 1} diff --git 
a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969191.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969191.oneflow-32
deleted file mode 100644
index f799b14cf25f492648c539794bac8f5bd78beb43..0000000000000000000000000000000000000000
GIT binary patch
[binary patch data omitted]
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969426.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969426.oneflow-32
deleted file mode 100644
GIT binary patch
[binary patch data omitted]
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969473.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679969473.oneflow-32
deleted file mode 100644
index 45678997c849a0bef0930a79e75c3162f93ea511..0000000000000000000000000000000000000000
GIT binary patch
[binary patch data omitted]
diff --git a/projects/MagicPrompt/oneflow_magicprompt/log.txt b/projects/MagicPrompt/oneflow_magicprompt/log.txt
deleted file mode 100644
@@ -1,1587 +0,0 @@
-[03/28 09:15:12] lb.engine.default INFO: > compiling dataset index builder ... -[03/28 09:15:12] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.053 seconds -[03/28 09:15:12] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.053 seconds -[03/28 09:37:48] libai INFO: Rank of current process: 0. World size: 1 -[03/28 09:37:48] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[03/28 09:37:48] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = True -model.cfg.bias_dropout_fusion = True -model.cfg.scale_mask_softmax_fusion = True -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = True - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=50, - amp=dict(enabled=True), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( -
data_parallel_size=1, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers=2 * model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/28 09:37:48] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml -[03/28 09:37:48] lb.engine.default INFO: > compiling dataset index builder ... -[03/28 09:37:48] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.052 seconds -[03/28 09:37:48] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.052 seconds -[03/28 09:38:23] libai INFO: Rank of current process: 0. World size: 1 -[03/28 09:38:23] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[03/28 09:38:23] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = True -model.cfg.bias_dropout_fusion = True -model.cfg.scale_mask_softmax_fusion = True -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = True - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=50, - amp=dict(enabled=True), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=1, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers=2 * model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - 
evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/28 09:38:23] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml -[03/28 09:38:23] lb.engine.default INFO: > compiling dataset index builder ... -[03/28 09:38:23] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.056 seconds -[03/28 09:38:23] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.056 seconds -[03/28 09:44:17] libai INFO: Rank of current process: 0. World size: 1 -[03/28 09:44:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[03/28 09:44:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = True -model.cfg.bias_dropout_fusion = True -model.cfg.scale_mask_softmax_fusion = True -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = True - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=50, - amp=dict(enabled=True), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=1, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers=2 * model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/28 09:44:17] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml -[03/28 09:44:17] 
-[03/28 09:44:17] lb.engine.default INFO: > compiling dataset index builder ...
-[03/28 09:44:17] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.050 seconds
-[03/28 09:44:17] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.051 seconds
-[03/28 09:44:19] lb.engine.default INFO: Prepare training, validating, testing set
-[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: reading sizes...
-[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: reading pointers...
-[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: reading document index...
-[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
-[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
-[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
-[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007179 seconds
-[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
-[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
-[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ...
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > last epoch number of samples (3905) is larger than 80% of number of samples per epoch (4511), setting separate_last_epoch to False
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ...
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.035229
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ...
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.002511
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 40607) and [40607, 40607) ...
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.001179
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ...
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > only one epoch required, setting separate_last_epoch to False -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ... -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.003403 -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ... -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.000375 -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 4511) and [4511, 4511) ... -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000237 -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ... -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > only one epoch required, setting separate_last_epoch to False -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ... -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.003150 -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ... -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.000333 -[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 4511) and [4511, 4511) ... 
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000232
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/28 09:44:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/28 09:44:19] lb.engine.default INFO: Prepare testing set
-[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: Dataset does not exist: /workspace/data/libai_dataset/loss_compara_content_sentence
-[03/28 09:44:19] lb.data.data_utils.indexed_dataset INFO: Path should be a basename that both .idx and .bin can be appended to get full filenames.
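The two INFO lines above spell out the convention behind the failed testing-set load: a Megatron-style mmap indexed dataset is a pair of files sharing one basename. A minimal sketch of the existence check, assuming that layout (the helper name is illustrative, not the actual libai API):

    import os

    def indexed_dataset_exists(prefix: str) -> bool:
        # `<prefix>.idx` stores sizes, pointers and the document index;
        # `<prefix>.bin` stores the token data itself. Both must exist.
        return os.path.exists(prefix + ".idx") and os.path.exists(prefix + ".bin")

Here /workspace/data/libai_dataset/loss_compara_content_sentence has neither suffix present, so the testing set is skipped with the message above.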
-[03/28 10:06:27] libai INFO: Rank of current process: 0. World size: 1
-[03/28 10:06:27] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/28 10:06:27] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/28 10:06:27] lb.engine.default INFO: > compiling dataset index builder ...
-[03/28 10:06:27] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.050 seconds
-[03/28 10:06:27] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.051 seconds
-[03/28 10:06:29] lb.engine.default INFO: Prepare training, validating, testing set
-[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: reading sizes...
-[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: reading pointers...
-[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: reading document index...
-[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
-[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
-[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
-[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007199 seconds
-[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
-[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
-[03/28 10:06:29] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/28 10:06:29] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/28 10:06:29] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/28 10:06:29] libai INFO: > Start building model...
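A hedged note on the auto-scaled figure above, assuming LiBai derives iterations as train_epoch x dataset length / global batch size: the training set exposes 40,607 usable samples (the 40,608 sample-index entries minus one boundary entry), and the global batch is 4 (micro-batch 4 x data-parallel size 1), so 33 x 40607 / 4 = 335,007.75, which rounds up to the train.train_iter=335008 reported here and overrides the configured train_iter=10000.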
-[03/28 10:06:30] lb.engine.default INFO: Model: -GPTForPreTraining( - (GPT_model): GPTModel( - (embeddings): GPTEmbedding( - (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) - (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) - (dropout): Dropout(p=0.1, inplace=False) - ) - (transformer): Transformer( - (layers): ModuleList( - (0): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - ) - ) - (1): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - ) - ) - (2): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - ) - ) - (3): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - ) - ) - (4): 
TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - ) - ) - (5): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - ) - ) - (6): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - ) - ) - (7): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - ) - ) - (8): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, 
out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - ) - ) - (9): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - ) - ) - (10): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - ) - ) - (11): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=True, bias_dropout_fusion=True, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - ) - ) - ) - (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - ) - (lm_head): LMLogits() - ) - (loss_func): GPTLoss( - (lm_loss): ParallelCrossEntropyLoss() - ) -) -[03/28 10:06:30] libai INFO: >>> done with building model. Building time: 0.277 seconds -[03/28 10:06:30] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR -[03/28 10:06:31] lb.engine.trainer INFO: Starting training from iteration 0 -[03/28 10:06:31] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... 
-[03/28 10:06:36] lb.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last):
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
-    self.run_step()
-  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
-    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
-    loss_dict = self.graph(**data)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
-    self._compile(*args, **kwargs)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
-    self.finish_compile_and_init_runtime()
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1165, in finish_compile_and_init_runtime
-    self._c_nn_graph.compile_plan_for_runtime()
-oneflow._oneflow_internal.exception.RuntimeError: Error: There is no kernel registered for Current OperatorConf. The Info of OperatorConf are
-    op_name: model.GPT_model.transformer.layers.0.self_attention-fused_tril_scale_softmax_mask_scale-29
-    op_type_name: fused_tril_scale_softmax_mask_scale
-    DeviceType_Name: kMLU
-    DataType_Name of x_0: kFloat
-    DataType_Name of mask_0: kBool
-    DataType_Name of softmax_y_0: kFloat
-    DataType_Name of y_0: kFloat
-Error message from /home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp:120
-    op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()): infer blob descs if failed, op name File "/home/zhangxiaoyu/libai/projects/MagicPrompt/layers/attention_layer.py", line 241, in forward, source < attention_weights = flow._C.fused_scale_tril_softmax_mask_scale( >; File "/home/zhangxiaoyu/libai/projects/MagicPrompt/layers/transformer_layer.py", line 180, in forward, source < attention_output = self.self_attention( >; ... 10 more
-
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in InferBlobDescs
-    op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc())
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/operator.cpp", line 351, in InferBlobDescsIf
-    InferInternalBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, job_desc)
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/user_op.cpp", line 711, in InferInternalBlobDescs
-    user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult( op_conf().user_conf().op_type_name(), UserOpKernelRegContext(this, GetBlobDesc4BnInOp, parallel_ctx))
-Error Type: oneflow.ErrorProto.op_kernel_not_found_error
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in operator()
-
-Error Type: oneflow.ErrorProto.runtime_error
-
-[03/28 10:06:36] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks)
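The failure above means the MLU build registers no kernel for the fused op that attention_layer.py calls. What the fused kernel computes can be expressed with unfused primitives; the sketch below is an assumption-laden reading of the op's name and its x/mask/softmax_y/y blobs, not the kernel's actual signature:

    import oneflow as flow

    def tril_scale_softmax_mask_scale(x, mask, scale, fill=-10000.0, mask_scale=1.0):
        # x: (batch, heads, seq, seq) raw attention scores; mask: dropout-style mask.
        x = x * scale                                    # "scale"
        causal = flow.tril(flow.ones_like(x))            # lower-triangular (causal) pattern
        x = x.masked_fill(causal == 0, fill)             # "tril": hide future positions
        softmax_y = flow.softmax(x, dim=-1)              # "softmax"
        y = softmax_y * mask.to(x.dtype) * mask_scale    # "mask_scale": dropout mask + rescale
        return y, softmax_y

In practice the log sidesteps the op entirely: the next run sets model.cfg.scale_mask_softmax_fusion = False so LiBai's unfused attention path is traced instead.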
-[03/28 10:10:22] libai INFO: Rank of current process: 0. World size: 1
-[03/28 10:10:22] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/28 10:10:22] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
-from libai.config import LazyCall
-from libai.evaluation import PPLEvaluator
-from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
-from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
-from configs.common.optim import optim
-
-from libai.scheduler import WarmupExponentialLR
-
-from configs.common.train import train
-from configs.common.models.graph import graph
-
-
-vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
-merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
-train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
-
-tokenization.tokenizer.vocab_file = vocab_file
-tokenization.tokenizer.merges_file = merge_files
-dataloader.train.dataset[0].data_prefix = train_data_prefix
-dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
-
-# gpt2 model config
-model.cfg.pretrained_model_path = None
-model.cfg.embedding_dropout_prob = 0.1
-model.cfg.attention_dropout_prob = 0.1
-model.cfg.output_dropout_prob = 0.1
-model.cfg.num_attention_heads = 12
-model.cfg.hidden_size = 768
-model.cfg.ffn_hidden_size = 4 * 768
-model.cfg.hidden_layers = 12
-model.cfg.max_seq_length = 1024
-model.cfg.initializer_range = 0.02
-model.cfg.vocab_size = 50257
-model.cfg.layernorm_epsilon = 1e-5
-model.cfg.use_scaled_init_for_output_weights = True
-model.cfg.bias_gelu_fusion = True
-model.cfg.bias_dropout_fusion = True
-model.cfg.scale_mask_softmax_fusion = False
-model.cfg.apply_query_key_layer_scaling = True
-model.cfg.apply_residual_post_layernorm = False
-model.cfg.amp_enabled = True
-
-train.input_placement_device = "cpu"
-
-train.dist.pipeline_num_layers = model.cfg.hidden_layers
-
-for ds in dataloader.train.dataset:
-    ds.max_seq_length = model.cfg.max_seq_length
-
-optim.lr = 5.0e-05
-
-train.update(
-    dict(
-        output_dir="projects/MagicPrompt/oneflow_magicprompt",
-        train_micro_batch_size=4,
-        test_micro_batch_size=4,
-        train_epoch=33,
-        train_iter=10000,
-        log_period=50,
-        amp=dict(enabled=True),
-        warmup_ratio=0,
-        checkpointer=dict(period=8000, max_to_keep=20),
-        dist=dict(
-            data_parallel_size=1,
-            tensor_parallel_size=1,
-            pipeline_parallel_size=1,
-            # pipeline_num_layers=2 * model.cfg.hidden_layers,
-        ),
-        scheduler=LazyCall(WarmupExponentialLR)(
-            warmup_factor=0.0,
-            gamma=1.0,
-            warmup_method="linear",
-            warmup_iter=0.0,
-        ),
-        evaluation=dict(
-            enabled=True,
-            evaluator=LazyCall(PPLEvaluator)(),
-            eval_iter=250,
-            eval_period=4000,
-        ),
-        rdma_enabled=False,
-    )
-)
-
-[03/28 10:10:22] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/28 10:10:22] lb.engine.default INFO: > compiling dataset index builder ...
-[03/28 10:10:22] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.044 seconds
-[03/28 10:10:22] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.045 seconds
-[03/28 10:10:24] lb.engine.default INFO: Prepare training, validating, testing set
-[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: reading sizes... -[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: reading pointers... -[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: reading document index... -[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... -[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... -[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... -[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007224 seconds -[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: -[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 -[03/28 10:10:24] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/28 10:10:24] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/28 10:10:24] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, 
train.warmup_iter=0
-[03/28 10:10:24] libai INFO: > Start building model...
-[03/28 10:10:24] libai INFO: >>> done with building model. Building time: 0.393 seconds
-[03/28 10:10:24] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/28 10:10:26] lb.engine.trainer INFO: Starting training from iteration 0
-[03/28 10:10:26] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/28 10:10:31] lb.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last):
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
-    self.run_step()
-  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
-    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
-    loss_dict = self.graph(**data)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
-    self._compile(*args, **kwargs)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
-    self.finish_compile_and_init_runtime()
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1165, in finish_compile_and_init_runtime
-    self._c_nn_graph.compile_plan_for_runtime()
-oneflow._oneflow_internal.exception.RuntimeError: Error: There is no kernel registered for Current OperatorConf. The Info of OperatorConf are
-    op_name: model.GPT_model.transformer.layers.0.self_attention-fused_bias_add_mask_scale-36
-    op_type_name: fused_bias_add_mask_scale
-    DeviceType_Name: kMLU
-    DataType_Name of a_0: kFloat
-    DataType_Name of b_0: kFloat
-    DataType_Name of mask_0: kBool
-    DataType_Name of _add_to_output_0: kFloat
-    DataType_Name of out_0: kFloat
-Error message from /home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp:120
-    op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()): infer blob descs if failed, op name File "/home/zhangxiaoyu/libai/projects/MagicPrompt/layers/attention_layer.py", line 258, in forward, source < output = flow._C.fused_bias_add_dropout( >; File "/home/zhangxiaoyu/libai/projects/MagicPrompt/layers/transformer_layer.py", line 180, in forward, source < attention_output = self.self_attention( >; ... 10 more
-
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in InferBlobDescs
-    op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc())
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/operator.cpp", line 351, in InferBlobDescsIf
-    InferInternalBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, job_desc)
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/user_op.cpp", line 711, in InferInternalBlobDescs
-    user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult( op_conf().user_conf().op_type_name(), UserOpKernelRegContext(this, GetBlobDesc4BnInOp, parallel_ctx))
-Error Type: oneflow.ErrorProto.op_kernel_not_found_error
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in operator()
-
-Error Type: oneflow.ErrorProto.runtime_error
-
-[03/28 10:10:31] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks)
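Same failure mode, next fused op: fused_bias_add_mask_scale backs the flow._C.fused_bias_add_dropout call, which folds the bias add, the dropout mask-and-scale, and the residual (_add_to_output) into one kernel. A minimal unfused sketch under those assumptions (the exact signature of the fused functional may differ):

    import oneflow as flow

    def bias_add_dropout(x, bias, residual, p=0.1, training=True):
        # bias add -> dropout -> residual add, as three ordinary ops.
        out = flow.nn.functional.dropout(x + bias, p=p, training=training)
        return out + residual

The run that follows disables bias_gelu_fusion and bias_dropout_fusion (scale_mask_softmax_fusion stays off), after which the model builds with a separate GeLUTanh activation and plain Dropout modules, as its printout shows.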
-[03/28 10:11:09] libai INFO: Rank of current process: 0. World size: 1
-[03/28 10:11:09] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/28 10:11:09] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
-from libai.config import LazyCall
-from libai.evaluation import PPLEvaluator
-from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
-from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
-from configs.common.optim import optim
-
-from libai.scheduler import WarmupExponentialLR
-
-from configs.common.train import train
-from configs.common.models.graph import graph
-
-
-vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
-merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
-train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
-
-tokenization.tokenizer.vocab_file = vocab_file
-tokenization.tokenizer.merges_file = merge_files
-dataloader.train.dataset[0].data_prefix = train_data_prefix
-dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
-
-# gpt2 model config
-model.cfg.pretrained_model_path = None
-model.cfg.embedding_dropout_prob = 0.1
-model.cfg.attention_dropout_prob = 0.1
-model.cfg.output_dropout_prob = 0.1
-model.cfg.num_attention_heads = 12
-model.cfg.hidden_size = 768
-model.cfg.ffn_hidden_size = 4 * 768
-model.cfg.hidden_layers = 12
-model.cfg.max_seq_length = 1024
-model.cfg.initializer_range = 0.02
-model.cfg.vocab_size = 50257
-model.cfg.layernorm_epsilon = 1e-5
-model.cfg.use_scaled_init_for_output_weights = True
-model.cfg.bias_gelu_fusion = False
-model.cfg.bias_dropout_fusion = False
-model.cfg.scale_mask_softmax_fusion = False
-model.cfg.apply_query_key_layer_scaling = True
-model.cfg.apply_residual_post_layernorm = False
-model.cfg.amp_enabled = True
-
-train.input_placement_device = "cpu"
-
-train.dist.pipeline_num_layers = model.cfg.hidden_layers
-
-for ds in dataloader.train.dataset:
-    ds.max_seq_length = model.cfg.max_seq_length
-
-optim.lr = 5.0e-05
-
-train.update(
-    dict(
-        output_dir="projects/MagicPrompt/oneflow_magicprompt",
-        train_micro_batch_size=4,
-        test_micro_batch_size=4,
-        train_epoch=33,
-        train_iter=10000,
-        log_period=50,
-        amp=dict(enabled=True),
-        warmup_ratio=0,
-        checkpointer=dict(period=8000, max_to_keep=20),
-        dist=dict(
-            data_parallel_size=1,
-            tensor_parallel_size=1,
-            pipeline_parallel_size=1,
-            # pipeline_num_layers=2 * model.cfg.hidden_layers,
-        ),
-        scheduler=LazyCall(WarmupExponentialLR)(
-            warmup_factor=0.0,
-            gamma=1.0,
-            warmup_method="linear",
-            warmup_iter=0.0,
-        ),
-        evaluation=dict(
-            enabled=True,
-            evaluator=LazyCall(PPLEvaluator)(),
-            eval_iter=250,
-            eval_period=4000,
-        ),
-        rdma_enabled=False,
-    )
-)
-
-[03/28 10:11:09] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/28 10:11:09] lb.engine.default INFO: > compiling dataset index builder ...
-[03/28 10:11:09] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.081 seconds
-[03/28 10:11:09] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.082 seconds
-[03/28 10:11:11] lb.engine.default INFO: Prepare training, validating, testing set
-[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: reading sizes... -[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: reading pointers... -[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: reading document index... -[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... -[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... -[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... -[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007395 seconds -[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: -[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 -[03/28 10:11:11] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.005 seconds -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/28 10:11:11] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/28 10:11:11] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, 
train.warmup_iter=0 -[03/28 10:11:11] libai INFO: > Start building model... -[03/28 10:11:11] lb.engine.default INFO: Model: -GPTForPreTraining( - (GPT_model): GPTModel( - (embeddings): GPTEmbedding( - (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) - (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) - (dropout): Dropout(p=0.1, inplace=False) - ) - (transformer): Transformer( - (layers): ModuleList( - (0): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (1): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (2): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (3): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): 
Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (4): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (5): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (6): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (7): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - 
(dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (8): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (9): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (10): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (11): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, 
elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - ) - (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - ) - (lm_head): LMLogits() - ) - (loss_func): GPTLoss( - (lm_loss): ParallelCrossEntropyLoss() - ) -) -[03/28 10:11:11] libai INFO: >>> done with building model. Building time: 0.231 seconds -[03/28 10:11:11] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR -[03/28 10:11:13] lb.engine.trainer INFO: Starting training from iteration 0 -[03/28 10:11:13] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... -[03/28 10:11:19] lb.engine.trainer ERROR: Exception during training: -Traceback (most recent call last): - File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train - self.run_step() - File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step - self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) - File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step - loss_dict = self.graph(**data) - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__ - self._compile(*args, **kwargs) - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile - self.finish_compile_and_init_runtime() - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1165, in finish_compile_and_init_runtime - self._c_nn_graph.compile_plan_for_runtime() -oneflow._oneflow_internal.exception.RuntimeError: Error: Cannot find the kernel matching Current OperatorConf. The Info of OperatorConf are - op_name: model.GPT_model.transformer.layers.11.mlp.activation_func-tanh_grad-601 - op_type_name: tanh_grad - DeviceType_Name: kMLU - DataType_Name of dy_0: kFloat - DataType_Name of x_0: kFloat - DataType_Name of dx_0: kFloat -Error message from /home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp:120 - op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()): infer blob descs if failed, op name File "/home/zhangxiaoyu/libai/libai/models/utils/graph_base.py", line 107, in build, source < losses.backward() >; File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step, source < loss_dict = self.graph(**data) >; ... 
5 more
-
- File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in InferBlobDescs
-  op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc())
- File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/operator.cpp", line 351, in InferBlobDescsIf
-  InferInternalBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, job_desc)
- File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/user_op.cpp", line 711, in InferInternalBlobDescs
-  user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult( op_conf().user_conf().op_type_name(), UserOpKernelRegContext(this, GetBlobDesc4BnInOp, parallel_ctx))
-Error Type: oneflow.ErrorProto.op_kernel_not_found_error
- File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in operator()
-
-Error Type: oneflow.ErrorProto.runtime_error
-
-[03/28 10:11:19] lb.engine.hooks INFO: Total training time: 0:00:06 (0:00:00 on hooks)
diff --git a/projects/MagicPrompt/oneflow_magicprompt/metrics.json b/projects/MagicPrompt/oneflow_magicprompt/metrics.json
deleted file mode 100644
index e69de29bb..000000000

From 22b8f86aa1991d40a7c25fe2cc1ad87c05209315 Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Tue, 28 Mar 2023 17:08:29 +0800
Subject: [PATCH 06/25] refine

---
 libai/version.py                                   |    2 +-
 projects/MagicPrompt/configs/gpt2_training.py      |    4 +-
 .../oneflow_magicprompt/config.yaml                |   76 +
 .../events.out.tfevents.1679971553.oneflow-32      |  Bin 0 -> 40 bytes
 .../events.out.tfevents.1679991467.oneflow-32      |  Bin 0 -> 40 bytes
 .../events.out.tfevents.1679991969.oneflow-32      |  Bin 0 -> 40 bytes
 .../events.out.tfevents.1679992167.oneflow-32      |  Bin 0 -> 40 bytes
 .../events.out.tfevents.1679993905.oneflow-32      |  Bin 0 -> 40 bytes
 .../events.out.tfevents.1679994348.oneflow-32      |  Bin 0 -> 40 bytes
 .../MagicPrompt/oneflow_magicprompt/log.txt        | 3153 +++++++++++++++++
 .../oneflow_magicprompt/metrics.json               |    0
 11 files changed, 3233 insertions(+), 2 deletions(-)
 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/config.yaml
 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679971553.oneflow-32
 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679991467.oneflow-32
 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679991969.oneflow-32
 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679992167.oneflow-32
 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679993905.oneflow-32
 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679994348.oneflow-32
 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/log.txt
 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/metrics.json

diff --git a/libai/version.py b/libai/version.py
index 36ee184fb..1b36290b1 100644
--- a/libai/version.py
+++ b/libai/version.py
@@ -1,2 +1,2 @@
 __version__ = '0.2.0'
-git_version = '89f63fc3be24e4d921b08656d051e143fcb70ef8'
+git_version = 'ec311a739120c1f93edf0dcc86627f61e84bcb02'
diff --git a/projects/MagicPrompt/configs/gpt2_training.py b/projects/MagicPrompt/configs/gpt2_training.py
index d648d0f52..4f7d5a540 100644
--- a/projects/MagicPrompt/configs/gpt2_training.py
+++ b/projects/MagicPrompt/configs/gpt2_training.py
@@ -38,7 +38,7 @@
 model.cfg.scale_mask_softmax_fusion = False
 model.cfg.apply_query_key_layer_scaling = True
 model.cfg.apply_residual_post_layernorm = False
-model.cfg.amp_enabled = True
+model.cfg.amp_enabled = False
 
 train.input_placement_device = "cpu"
 
@@ -48,6 +48,8 @@
     ds.max_seq_length = model.cfg.max_seq_length
 
 optim.lr = 5.0e-05
+optim.params.clip_grad_max_norm = None
+optim.params.clip_grad_norm_type = None
 
 train.update(
     dict(
diff --git a/projects/MagicPrompt/oneflow_magicprompt/config.yaml b/projects/MagicPrompt/oneflow_magicprompt/config.yaml
new file mode 100644
index 000000000..a4e6c80c8
--- /dev/null
+++ b/projects/MagicPrompt/oneflow_magicprompt/config.yaml
@@ -0,0 +1,76 @@
+dataloader:
+  train:
+    _target_: libai.data.build_nlp_train_val_test_loader
+    dataset:
+    - _target_: libai.data.datasets.GPT2Dataset
+      data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence
+      indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false}
+      max_seq_length: 1024
+      name: gpt-2
+      seed: 1234
+    num_workers: 4
+    splits:
+    - [949.0, 50.0, 1.0]
+    train_val_test_num_samples: null
+    weights: [1.0]
+ds:
+  _target_: libai.data.datasets.GPT2Dataset
+  data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence
+  indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false}
+  max_seq_length: 1024
+  name: gpt-2
+  seed: 1234
+graph:
+  auto_parallel: {enable_auto_parallel_ignore_user_sbp_config: false, enabled: false, sbp_collector: false, trunk_algo: true}
+  debug: -1
+  enabled: true
+  eval_graph: {_target_: libai.models.utils.GraphBase, is_train: false}
+  train_graph: {_target_: libai.models.utils.GraphBase, is_train: true}
+model:
+  _target_: projects.MagicPrompt.gpt2.GPTForPreTraining
+  cfg: {amp_enabled: false, apply_query_key_layer_scaling: true, apply_residual_post_layernorm: false, attention_dropout_prob: 0.1, bias_dropout_fusion: false, bias_gelu_fusion: false, bos_token_id: 50256, chunk_size_feed_forward: 0, decoder_start_token_id: null, diversity_penalty: 0.0, do_sample: false, early_stopping: false, embedding_dropout_prob: 0.1, encoder_no_repeat_ngram_size: 0, eos_token_id: 50256, exponential_decay_length_penalty: null, ffn_hidden_size: 3072, forced_bos_token_id: null, forced_eos_token_id: null, hidden_layers: 12, hidden_size: 768, initializer_range: 0.02, is_encoder_decoder: false, layernorm_epsilon: 1.0e-05, length_penalty: 1.0, max_length: 20, max_seq_length: 1024, min_length: 0, no_repeat_ngram_size: 0, num_attention_heads: 12, num_beam_groups: 1, num_beams: 1, num_return_sequences: 1, output_dropout_prob: 0.1, output_scores: false, pad_token_id: 0, pretrained_model_path: null, remove_invalid_values: false, repetition_penalty: 1.0, scale_mask_softmax_fusion: false, sep_token_id: null, temperature: 1.0, top_k: 50, top_p: 1.0, typical_p: 1.0, use_cache: true, use_scaled_init_for_output_weights: true, vocab_size: 50257}
+optim:
+  _target_: oneflow.nn.optimizer.adamw.AdamW
+  betas: [0.9, 0.999]
+  do_bias_correction: true
+  eps: 1.0e-08
+  lr: 5.0e-05
+  params: {_target_: libai.optim.get_default_optimizer_params, clip_grad_max_norm: null, clip_grad_norm_type: null, weight_decay_bias: 0.0, weight_decay_norm: 0.0}
+  weight_decay: 0.01
+tokenization:
+  append_eod: false
+  make_vocab_size_divisible_by: 128
+  tokenizer: {_target_: libai.tokenizer.GPT2Tokenizer, do_chinese_wwm: true, do_lower_case: true, merges_file: /home/zhangxiaoyu/magicprompt/merges.txt, vocab_file: /home/zhangxiaoyu/magicprompt/vocab.json}
+train:
+  activation_checkpoint: {enabled: false}
+  amp: {enabled: true}
+  checkpointer: {max_to_keep: 20, period: 8000}
+  consumed_train_samples: 0
+  consumed_valid_samples: 0
+  dist: {data_parallel_size: 1, num_gpus_per_node: 1, num_nodes: 1, pipeline_num_layers: 10000, pipeline_parallel_size: 1, tensor_parallel_size: 1}
+  evaluation:
+    enabled: true
+    eval_iter: 250
+    eval_period: 4000
+    evaluator: {_target_: libai.evaluation.PPLEvaluator}
+  global_batch_size: 4
+  input_placement_device: cpu
+  load_weight: ''
+  log_period: 50
+  nccl_fusion_max_ops: 24
+  nccl_fusion_threshold_mb: 16
+  num_accumulation_steps: 1
+  output_dir: projects/MagicPrompt/oneflow_magicprompt
+  rdma_enabled: false
+  resume: false
+  samples: 40000
+  scheduler: {_target_: libai.scheduler.WarmupExponentialLR, gamma: 1.0, warmup_factor: 0.0, warmup_iter: 0.0, warmup_method: linear}
+  seed: 1234
+  start_iter: 0
+  test_micro_batch_size: 4
+  train_epoch: 33
+  train_iter: 10000
+  train_micro_batch_size: 4
+  train_samples: null
+  warmup_ratio: 0
+  zero_optimization: {enabled: false, stage: 1}
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679971553.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679971553.oneflow-32
new file mode 100644
index 0000000000000000000000000000000000000000..7aaa0862212d2e517cbda58fbf0f63914d13c3ea
GIT binary patch
literal 40
rcmb1OfPlsI-b$PgU%6RK<+$l6#hX-=n3<>NT9%quVr67-U?UFz(f153

literal 0
HcmV?d00001

diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679991467.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679991467.oneflow-32
new file mode 100644
index 0000000000000000000000000000000000000000..970f3ea7099529f4beaa136dac5f875acb1123d5
GIT binary patch
literal 40
rcmb1OfPlsI-b$Rx4?eA0!Ew`3iZ`h!F*8rkwJbHS#LDQQCD%Iu+T9HM

literal 0
HcmV?d00001

diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679991969.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679991969.oneflow-32
new file mode 100644
index 0000000000000000000000000000000000000000..115d6489b0c56ef9eb834913f9db25ff12020b19
GIT binary patch
literal 40
rcmb1OfPlsI-b$PnuOc*7a@=&3;!P?_%*@ksElbTSu`=qeDcAx4!uSjr

literal 0
HcmV?d00001

diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679992167.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679992167.oneflow-32
new file mode 100644
index 0000000000000000000000000000000000000000..f23eb044039af2e5843d973b3273cfe68efe6ee4
GIT binary patch
literal 40
rcmb1OfPlsI-b$QngO5k9NT9%quVr8_7$ND?~&P5C>

literal 0
HcmV?d00001

diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679993905.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679993905.oneflow-32
new file mode 100644
index 0000000000000000000000000000000000000000..c5ac8326cb52c96fd20d9fc76f40d76d2ab16147
GIT binary patch
literal 40
rcmb1OfPlsI-b$RGIvjXbbKG>4;!P?_%*@ksElbTSu`+7>>Tv@A$nOlP

literal 0
HcmV?d00001

diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679994348.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679994348.oneflow-32
new file mode 100644
index 0000000000000000000000000000000000000000..0582e12c5094a469dc036407910ba4000679291c
GIT binary patch
literal 40
rcmb1OfPlsI-b$Q0*Ho)lbKG>4;!P?_%*@ksElbTSu`)7Tv$+re!V(Mc

literal 0
HcmV?d00001

diff --git a/projects/MagicPrompt/oneflow_magicprompt/log.txt b/projects/MagicPrompt/oneflow_magicprompt/log.txt
new
file mode 100644 index 000000000..0e76edb0f --- /dev/null +++ b/projects/MagicPrompt/oneflow_magicprompt/log.txt @@ -0,0 +1,3153 @@ +[03/28 10:45:49] libai INFO: Rank of current process: 0. World size: 1 +[03/28 10:45:49] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/28 10:45:49] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = True + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=50, + amp=dict(enabled=True), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=1, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers=2 * model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/28 10:45:49] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml +[03/28 10:45:49] lb.engine.default INFO: > compiling dataset index builder ... +[03/28 10:45:49] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.052 seconds +[03/28 10:45:49] lb.engine.default INFO: >>> done with compiling. 
Compilation time: 0.053 seconds +[03/28 10:45:51] lb.engine.default INFO: Prepare training, validating, testing set +[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.009791 seconds +[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.006 seconds +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: loaded indexed 
file in 0.001 seconds +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/28 10:45:51] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 +[03/28 10:45:51] libai INFO: > Start building model... +[03/28 10:45:51] lb.engine.default INFO: Model: +GPTForPreTraining( + (GPT_model): GPTModel( + (embeddings): GPTEmbedding( + (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) + (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) + (dropout): Dropout(p=0.1, inplace=False) + ) + (transformer): Transformer( + (layers): ModuleList( + (0): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (1): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (2): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (3): TransformerLayer( + (drop_path): Identity() + (input_layernorm): 
LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (4): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (5): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (6): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): 
Dropout(p=0.1, inplace=False) + ) + ) + (7): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (8): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (9): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (10): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): 
GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (11): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (lm_head): LMLogits() + ) + (loss_func): GPTLoss( + (lm_loss): ParallelCrossEntropyLoss() + ) +) +[03/28 10:45:51] libai INFO: >>> done with building model. Building time: 0.161 seconds +[03/28 10:45:51] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR +[03/28 10:45:53] lb.engine.trainer INFO: Starting training from iteration 0 +[03/28 10:45:53] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... +[03/28 10:45:59] lb.engine.trainer ERROR: Exception during training: +Traceback (most recent call last): + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train + self.run_step() + File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step + self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step + loss_dict = self.graph(**data) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__ + self._compile(*args, **kwargs) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile + self.finish_compile_and_init_runtime() + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1165, in finish_compile_and_init_runtime + self._c_nn_graph.compile_plan_for_runtime() +oneflow._oneflow_internal.exception.RuntimeError: Error: Cannot find the kernel matching Current OperatorConf. The Info of OperatorConf are + op_name: model.GPT_model.transformer.layers.11.mlp.activation_func-scalar_pow_grad-605 + op_type_name: scalar_pow_grad + DeviceType_Name: kMLU + DataType_Name of dy_0: kFloat + DataType_Name of x_0: kFloat + DataType_Name of dx_0: kFloat +Error message from /home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp:120 + op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()): infer blob descs if failed, op name File "/home/zhangxiaoyu/libai/libai/models/utils/graph_base.py", line 107, in build, source < losses.backward() >; File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step, source < loss_dict = self.graph(**data) >; ... 
5 more + + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in InferBlobDescs + op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/operator.cpp", line 351, in InferBlobDescsIf + InferInternalBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, job_desc) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/user_op.cpp", line 711, in InferInternalBlobDescs + user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult( op_conf().user_conf().op_type_name(), UserOpKernelRegContext(this, GetBlobDesc4BnInOp, parallel_ctx)) +Error Type: oneflow.ErrorProto.op_kernel_not_found_error + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in operator() + +Error Type: oneflow.ErrorProto.runtime_error + +[03/28 10:45:59] lb.engine.hooks INFO: Total training time: 0:00:06 (0:00:00 on hooks) +[03/28 16:17:43] libai INFO: Rank of current process: 0. World size: 1 +[03/28 16:17:43] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/28 16:17:43] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = True + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=50, + amp=dict(enabled=True), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=1, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers=2 * 
model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/28 16:17:43] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml +[03/28 16:17:43] lb.engine.default INFO: > compiling dataset index builder ... +[03/28 16:17:43] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.042 seconds +[03/28 16:17:43] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.043 seconds +[03/28 16:17:46] lb.engine.default INFO: Prepare training, validating, testing set +[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.010023 seconds +[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.007 seconds +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 
+[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/28 16:17:46] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 +[03/28 16:17:46] libai INFO: > Start building model... +[03/28 16:17:46] lb.engine.default INFO: Model: +GPTForPreTraining( + (GPT_model): GPTModel( + (embeddings): GPTEmbedding( + (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) + (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) + (dropout): Dropout(p=0.1, inplace=False) + ) + (transformer): Transformer( + (layers): ModuleList( + (0): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (1): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (2): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + 
(query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+        (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+      )
+      (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+      (mlp): MLP(
+        bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+        (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+        (activation_func): GeLUTanh()
+        (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+    )
+    [... (3) through (11): nine more TransformerLayer blocks identical to the ones above, repeated printout omitted ...]
+    )
+    (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+  )
+  (lm_head): LMLogits()
+ )
+ (loss_func): GPTLoss(
+   (lm_loss): ParallelCrossEntropyLoss()
+ )
+)
+[03/28 16:17:46] libai INFO: >>> done with building model. Building time: 0.208 seconds
+[03/28 16:17:46] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/28 16:17:47] lb.engine.trainer INFO: Starting training from iteration 0
+[03/28 16:17:47] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/28 16:17:54] lb.engine.trainer ERROR: Exception during training:
+Traceback (most recent call last):
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
+    self.run_step()
+  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
+    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
+    loss_dict = self.graph(**data)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
+    self._compile(*args, **kwargs)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
+    self.finish_compile_and_init_runtime()
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1165, in finish_compile_and_init_runtime
+    self._c_nn_graph.compile_plan_for_runtime()
+oneflow._oneflow_internal.exception.RuntimeError: Error: Cannot find the kernel matching Current OperatorConf. The Info of OperatorConf are
+  op_name: System-ClipGradient-GlobalNorm-MultiReduceSumPowAbs-423
+  op_type_name: multi_reduce_sum_pow_abs
+  DeviceType_Name: kMLU
+  DataType_Name of x_0 ... x_97: kFloat  [98 identical kFloat input lines omitted]
+  DataType_Name of y_0: kFloat
+Error message from /home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp:120
+  op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()): infer blob descs if failed, op name
+
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in InferBlobDescs
+    op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc())
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/operator.cpp", line 351, in InferBlobDescsIf
+    InferInternalBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, job_desc)
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/user_op.cpp", line 711, in InferInternalBlobDescs
+    user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult( op_conf().user_conf().op_type_name(), UserOpKernelRegContext(this, GetBlobDesc4BnInOp, parallel_ctx))
+Error Type: oneflow.ErrorProto.op_kernel_not_found_error
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in operator()
+
+Error Type: oneflow.ErrorProto.runtime_error
+
+[03/28 16:17:54] lb.engine.hooks INFO: Total training time: 0:00:06 (0:00:00 on hooks)
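
The op that cannot be matched, multi_reduce_sum_pow_abs, is emitted by gradient-norm clipping (the op_name prefix System-ClipGradient-GlobalNorm), and oneflow-cambricon has no MLU kernel for it at this point. The next run below works around this in the training config; a minimal sketch of that change, reusing the optim object imported in the config printed below:

    from configs.common.optim import optim

    # Gradient-norm clipping is what puts multi_reduce_sum_pow_abs into the
    # compiled train graph; clearing the max norm keeps the op off the MLU.
    optim.params.clip_grad_max_norm = None
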
"/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in InferBlobDescs + op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/operator.cpp", line 351, in InferBlobDescsIf + InferInternalBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, job_desc) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/user_op.cpp", line 711, in InferInternalBlobDescs + user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult( op_conf().user_conf().op_type_name(), UserOpKernelRegContext(this, GetBlobDesc4BnInOp, parallel_ctx)) +Error Type: oneflow.ErrorProto.op_kernel_not_found_error + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in operator() + +Error Type: oneflow.ErrorProto.runtime_error + +[03/28 16:17:54] lb.engine.hooks INFO: Total training time: 0:00:06 (0:00:00 on hooks) +[03/28 16:26:02] libai INFO: Rank of current process: 0. World size: 1 +[03/28 16:26:02] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/28 16:26:03] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = False + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=50, + amp=dict(enabled=True), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=1, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # 
pipeline_num_layers=2 * model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/28 16:26:03] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml +[03/28 16:26:03] lb.engine.default INFO: > compiling dataset index builder ... +[03/28 16:26:03] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.309 seconds +[03/28 16:26:03] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.310 seconds +[03/28 16:26:06] lb.engine.default INFO: Prepare training, validating, testing set +[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.011565 seconds +[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy +[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy +[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy +[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.007 seconds +[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 +[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 +[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy +[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy +[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy +[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: total 
+[03/28 16:26:06] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/28 16:26:06] libai INFO: > Start building model...
+[03/28 16:26:07] lb.engine.default INFO: Model:
+GPTForPreTraining(
+  [... identical module structure as printed for the previous run: GPTEmbedding, 12 x TransformerLayer, layernorm_f, LMLogits, GPTLoss ...]
+)
+[03/28 16:26:07] libai INFO: >>> done with building model. Building time: 0.790 seconds
+[03/28 16:26:07] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/28 16:26:09] lb.engine.trainer INFO: Starting training from iteration 0
+[03/28 16:26:09] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/28 16:26:17] lb.engine.trainer ERROR: Exception during training:
+Traceback (most recent call last):
+  [... identical Python traceback as in the previous run: trainer.py -> default.py -> graph.py ...]
+oneflow._oneflow_internal.exception.RuntimeError: Error: Cannot find the kernel matching Current OperatorConf.
+The Info of OperatorConf are
+  op_name: System-DynamicLossScale-MultiCountNotFinite-366
+  op_type_name: multi_count_not_finite
+  DeviceType_Name: kMLU
+  DataType_Name of x_0 ... x_147: kFloat  [148 identical kFloat input lines omitted]
+  DataType_Name of y_0: kInt64
+Error message from /home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp:120
+  op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()): infer blob descs if failed, op name
+  [... identical C++ call chain as in the previous error: exec_graph.cpp:120 -> operator.cpp:351 -> user_op.cpp:711 ...]
+Error Type: oneflow.ErrorProto.op_kernel_not_found_error
+
+Error Type: oneflow.ErrorProto.runtime_error
+
+[03/28 16:26:17] lb.engine.hooks INFO: Total training time: 0:00:08 (0:00:00 on hooks)
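
With gradient clipping disabled, compilation now stops on multi_count_not_finite instead. That op belongs to dynamic loss scaling (op_name prefix System-DynamicLossScale), which is present because the config above keeps graph-level AMP on via train.update(... amp=dict(enabled=True) ...) even though model.cfg.amp_enabled = False. The next run only adds optim.params.clip_grad_norm_type = None, which does not touch the loss-scale ops, so it fails the same way. A hedged sketch of the change that would avoid this op, assuming no MLU kernel is added for it:

    from configs.common.train import train

    # Dynamic loss scaling is generated only when AMP is enabled; turning
    # AMP off keeps multi_count_not_finite out of the compiled train graph.
    train.amp.enabled = False
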
+[03/28 16:29:20] libai INFO: Rank of current process: 0. World size: 1
+[03/28 16:29:20] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/28 16:29:20] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+[... same config contents as in the previous run, except that one line is added directly after optim.params.clip_grad_max_norm = None:]
+optim.params.clip_grad_norm_type = None
+[03/28 16:29:20] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[03/28 16:29:20] lb.engine.default INFO: > compiling dataset index builder ...
+[03/28 16:29:20] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.187 seconds
+[03/28 16:29:20] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.187 seconds
+[03/28 16:29:24] lb.engine.default INFO: Prepare training, validating, testing set
+[... same indexed_dataset / gpt_dataset loading output as in the previous run: 73718 documents, 106365 sentences; 40608 / 4512 / 4512 samples ...]
+[03/28 16:29:24] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/28 16:29:24] libai INFO: > Start building model...
+[03/28 16:29:25] lb.engine.default INFO: Model:
+GPTForPreTraining(
+  [... identical module structure as printed for the previous runs ...]
+)
+[03/28 16:29:25] libai INFO: >>> done with building model. Building time: 0.518 seconds
+[03/28 16:29:25] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/28 16:29:27] lb.engine.trainer INFO: Starting training from iteration 0
+[03/28 16:29:27] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/28 16:29:37] lb.engine.trainer ERROR: Exception during training:
+Traceback (most recent call last):
+  [... identical Python traceback as in the previous runs ...]
+oneflow._oneflow_internal.exception.RuntimeError: Error: Cannot find the kernel matching Current OperatorConf.
+The Info of OperatorConf are
+  op_name: System-DynamicLossScale-MultiCountNotFinite-366
+  op_type_name: multi_count_not_finite
+  DeviceType_Name: kMLU
+  [... identical operator info and error message as in the previous multi_count_not_finite failure: 148 kFloat inputs, kInt64 output, oneflow.ErrorProto.op_kernel_not_found_error ...]
+
+[03/28 16:29:37] lb.engine.hooks INFO: Total training time: 0:00:10 (0:00:00 on hooks)
World size: 1 +[03/28 16:58:19] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/28 16:58:19] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = False + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=50, + amp=dict(enabled=True), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=1, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers=2 * model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/28 16:58:19] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml +[03/28 16:58:19] lb.engine.default INFO: > compiling dataset index builder ... +[03/28 16:58:19] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.056 seconds +[03/28 16:58:19] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.056 seconds +[03/28 16:58:22] lb.engine.default INFO: Prepare training, validating, testing set +[03/28 16:58:22] lb.data.data_utils.indexed_dataset INFO: building dataset index ... 
+[03/28 16:58:22] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/28 16:58:22] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/28 16:58:22] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/28 16:58:22] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/28 16:58:22] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/28 16:58:22] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/28 16:58:22] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/28 16:58:22] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008211 seconds +[03/28 16:58:22] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/28 16:58:22] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/28 16:58:22] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.005 seconds +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/28 16:58:22] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/28 16:58:22] 
lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 +[03/28 16:58:22] libai INFO: > Start building model... +[03/28 16:58:23] lb.engine.default INFO: Model: +GPTForPreTraining( + (GPT_model): GPTModel( + (embeddings): GPTEmbedding( + (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) + (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) + (dropout): Dropout(p=0.1, inplace=False) + ) + (transformer): Transformer( + (layers): ModuleList( + (0): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (1): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (2): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (3): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + 
(output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (4): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (5): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (6): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (7): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): 
MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (8): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (9): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (10): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (11): TransformerLayer( + 
(drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (lm_head): LMLogits() + ) + (loss_func): GPTLoss( + (lm_loss): ParallelCrossEntropyLoss() + ) +) +[03/28 16:58:23] libai INFO: >>> done with building model. Building time: 0.891 seconds +[03/28 16:58:23] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR +[03/28 16:58:25] lb.engine.trainer INFO: Starting training from iteration 0 +[03/28 16:58:25] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... +[03/28 16:58:31] lb.engine.trainer ERROR: Exception during training: +Traceback (most recent call last): + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train + self.run_step() + File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step + self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step + loss_dict = self.graph(**data) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__ + self._compile(*args, **kwargs) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile + self.finish_compile_and_init_runtime() + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1165, in finish_compile_and_init_runtime + self._c_nn_graph.compile_plan_for_runtime() +oneflow._oneflow_internal.exception.RuntimeError: Error: Cannot find the kernel matching Current OperatorConf. 
The Info of OperatorConf are + op_name: System-DynamicLossScale-MultiCountNotFinite-366 + op_type_name: multi_count_not_finite + DeviceType_Name: kMLU + DataType_Name of x_0 ... x_147: kFloat (148 inputs, all kFloat; the identical per-input entries are elided here) + DataType_Name of y_0: kInt64 +Error message from /home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp:120 + op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()): infer blob descs if failed, op name + + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in InferBlobDescs + op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/operator.cpp", line 351, in InferBlobDescsIf + InferInternalBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, job_desc) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/user_op.cpp", line 711, in InferInternalBlobDescs + user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult( op_conf().user_conf().op_type_name(), UserOpKernelRegContext(this, GetBlobDesc4BnInOp, parallel_ctx)) +Error Type: oneflow.ErrorProto.op_kernel_not_found_error + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in operator() + +Error Type: oneflow.ErrorProto.runtime_error + +[03/28 16:58:31] lb.engine.hooks INFO: Total training time: 0:00:06 (0:00:00 on hooks)
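The op_kernel_not_found_error above is the root cause of the failed runs: the Cambricon (kMLU) build has no kernel registered for the multi_count_not_finite op, which the graph compiler only emits when AMP with dynamic loss scaling is active (the op name System-DynamicLossScale-MultiCountNotFinite-366 makes the origin explicit). Because the training config enables AMP through train.update(dict(amp=dict(enabled=True), ...)), one plausible workaround until an MLU kernel for this op lands is to disable graph AMP there. A minimal sketch, assuming the same LazyConfig conventions as gpt2_training.py above:

# Hypothetical workaround (not part of this patch series): in
# projects/MagicPrompt/configs/gpt2_training.py, turn graph AMP off so the
# compiler never inserts the System-DynamicLossScale ops that need the
# missing multi_count_not_finite MLU kernel.
train.update(
    dict(
        amp=dict(enabled=False),
    )
)

Note that model.cfg.amp_enabled = False in the same file is a model-level flag; it is the train.amp entry that drives graph AMP, which is presumably why the loss-scale ops still appear despite it.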
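If mixed precision is still desired on the MLU device, an alternative, sketched below under the assumption that the oneflow-cambricon build exposes OneFlow's standard nn.Graph AMP API, is to keep AMP but switch from dynamic to static loss scaling: a static scaler performs no overflow check, so no MultiCountNotFinite op should be generated. The class is illustrative only (it is not LiBai's GraphBase) and assumes the model returns a scalar loss, whereas LiBai models actually return a dict of losses.

import oneflow as flow

class AmpTrainGraph(flow.nn.Graph):
    # Minimal sketch of an AMP training graph with static loss scaling.
    def __init__(self, model, optimizer):
        super().__init__()
        self.model = model
        self.add_optimizer(optimizer)
        self.config.enable_amp(True)
        # A static scaler never counts inf/nan values, so the device does
        # not need a multi_count_not_finite kernel.
        self.set_grad_scaler(flow.amp.StaticGradScaler(2.0 ** 12))

    def build(self, input_ids):
        loss = self.model(input_ids)  # assumed to return a scalar loss
        loss.backward()
        return loss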
diff --git a/projects/MagicPrompt/oneflow_magicprompt/metrics.json b/projects/MagicPrompt/oneflow_magicprompt/metrics.json new file mode 100644 index 000000000..e69de29bb From d47fe9c7908c1e7a4604574b2bd58a7e06d20645 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Tue, 28 Mar 2023 17:08:49 +0800 Subject: [PATCH 07/25] refine --- .../oneflow_magicprompt/config.yaml | 76 - .../events.out.tfevents.1679971553.oneflow-32 | Bin 40 -> 0 bytes .../events.out.tfevents.1679991467.oneflow-32 | Bin 40 -> 0 bytes .../events.out.tfevents.1679991969.oneflow-32 | Bin 40 -> 0 bytes .../events.out.tfevents.1679992167.oneflow-32 | Bin 40 -> 0 bytes .../events.out.tfevents.1679993905.oneflow-32 | Bin 40 -> 0 bytes .../events.out.tfevents.1679994348.oneflow-32 | Bin 40 -> 0 bytes .../MagicPrompt/oneflow_magicprompt/log.txt | 3153 ----------------- .../oneflow_magicprompt/metrics.json | 0 9 files changed, 3229 deletions(-) delete mode 100644
projects/MagicPrompt/oneflow_magicprompt/config.yaml delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679971553.oneflow-32 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679991467.oneflow-32 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679991969.oneflow-32 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679992167.oneflow-32 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679993905.oneflow-32 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679994348.oneflow-32 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/log.txt delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/metrics.json diff --git a/projects/MagicPrompt/oneflow_magicprompt/config.yaml b/projects/MagicPrompt/oneflow_magicprompt/config.yaml deleted file mode 100644 index a4e6c80c8..000000000 --- a/projects/MagicPrompt/oneflow_magicprompt/config.yaml +++ /dev/null @@ -1,76 +0,0 @@ -dataloader: - train: - _target_: libai.data.build_nlp_train_val_test_loader - dataset: - - _target_: libai.data.datasets.GPT2Dataset - data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence - indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false} - max_seq_length: 1024 - name: gpt-2 - seed: 1234 - num_workers: 4 - splits: - - [949.0, 50.0, 1.0] - train_val_test_num_samples: null - weights: [1.0] -ds: - _target_: libai.data.datasets.GPT2Dataset - data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence - indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false} - max_seq_length: 1024 - name: gpt-2 - seed: 1234 -graph: - auto_parallel: {enable_auto_parallel_ignore_user_sbp_config: false, enabled: false, sbp_collector: false, trunk_algo: true} - debug: -1 - enabled: true - eval_graph: {_target_: libai.models.utils.GraphBase, is_train: false} - train_graph: {_target_: libai.models.utils.GraphBase, is_train: true} -model: - _target_: projects.MagicPrompt.gpt2.GPTForPreTraining - cfg: {amp_enabled: false, apply_query_key_layer_scaling: true, apply_residual_post_layernorm: false, attention_dropout_prob: 0.1, bias_dropout_fusion: false, bias_gelu_fusion: false, bos_token_id: 50256, chunk_size_feed_forward: 0, decoder_start_token_id: null, diversity_penalty: 0.0, do_sample: false, early_stopping: false, embedding_dropout_prob: 0.1, encoder_no_repeat_ngram_size: 0, eos_token_id: 50256, exponential_decay_length_penalty: null, ffn_hidden_size: 3072, forced_bos_token_id: null, forced_eos_token_id: null, hidden_layers: 12, hidden_size: 768, initializer_range: 0.02, is_encoder_decoder: false, layernorm_epsilon: 1.0e-05, length_penalty: 1.0, max_length: 20, max_seq_length: 1024, min_length: 0, no_repeat_ngram_size: 0, num_attention_heads: 12, num_beam_groups: 1, num_beams: 1, num_return_sequences: 1, output_dropout_prob: 0.1, output_scores: false, pad_token_id: 0, pretrained_model_path: null, remove_invalid_values: false, repetition_penalty: 1.0, scale_mask_softmax_fusion: false, sep_token_id: null, temperature: 1.0, top_k: 50, top_p: 1.0, typical_p: 1.0, use_cache: true, use_scaled_init_for_output_weights: true, vocab_size: 50257} -optim: - _target_: 
oneflow.nn.optimizer.adamw.AdamW - betas: [0.9, 0.999] - do_bias_correction: true - eps: 1.0e-08 - lr: 5.0e-05 - params: {_target_: libai.optim.get_default_optimizer_params, clip_grad_max_norm: null, clip_grad_norm_type: null, weight_decay_bias: 0.0, weight_decay_norm: 0.0} - weight_decay: 0.01 -tokenization: - append_eod: false - make_vocab_size_divisible_by: 128 - tokenizer: {_target_: libai.tokenizer.GPT2Tokenizer, do_chinese_wwm: true, do_lower_case: true, merges_file: /home/zhangxiaoyu/magicprompt/merges.txt, vocab_file: /home/zhangxiaoyu/magicprompt/vocab.json} -train: - activation_checkpoint: {enabled: false} - amp: {enabled: true} - checkpointer: {max_to_keep: 20, period: 8000} - consumed_train_samples: 0 - consumed_valid_samples: 0 - dist: {data_parallel_size: 1, num_gpus_per_node: 1, num_nodes: 1, pipeline_num_layers: 10000, pipeline_parallel_size: 1, tensor_parallel_size: 1} - evaluation: - enabled: true - eval_iter: 250 - eval_period: 4000 - evaluator: {_target_: libai.evaluation.PPLEvaluator} - global_batch_size: 4 - input_placement_device: cpu - load_weight: '' - log_period: 50 - nccl_fusion_max_ops: 24 - nccl_fusion_threshold_mb: 16 - num_accumulation_steps: 1 - output_dir: projects/MagicPrompt/oneflow_magicprompt - rdma_enabled: false - resume: false - samples: 40000 - scheduler: {_target_: libai.scheduler.WarmupExponentialLR, gamma: 1.0, warmup_factor: 0.0, warmup_iter: 0.0, warmup_method: linear} - seed: 1234 - start_iter: 0 - test_micro_batch_size: 4 - train_epoch: 33 - train_iter: 10000 - train_micro_batch_size: 4 - train_samples: null - warmup_ratio: 0 - zero_optimization: {enabled: false, stage: 1} diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679971553.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679971553.oneflow-32 deleted file mode 100644 index 7aaa0862212d2e517cbda58fbf0f63914d13c3ea..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 40 rcmb1OfPlsI-b$PgU%6RK<+$l6#hX-=n3<>NT9%quVr67-U?UFz(f153 diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679991467.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679991467.oneflow-32 deleted file mode 100644 index 970f3ea7099529f4beaa136dac5f875acb1123d5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 40 rcmb1OfPlsI-b$Rx4?eA0!Ew`3iZ`h!F*8rkwJbHS#LDQQCD%Iu+T9HM diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679991969.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679991969.oneflow-32 deleted file mode 100644 index 115d6489b0c56ef9eb834913f9db25ff12020b19..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 40 rcmb1OfPlsI-b$PnuOc*7a@=&3;!P?_%*@ksElbTSu`=qeDcAx4!uSjr diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679992167.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679992167.oneflow-32 deleted file mode 100644 index f23eb044039af2e5843d973b3273cfe68efe6ee4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 40 rcmb1OfPlsI-b$QngO5k9NT9%quVr8_7$ND?~&P5C> diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679993905.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679993905.oneflow-32 deleted file mode 100644 index c5ac8326cb52c96fd20d9fc76f40d76d2ab16147..0000000000000000000000000000000000000000 GIT binary patch 
literal 0 HcmV?d00001 literal 40 rcmb1OfPlsI-b$RGIvjXbbKG>4;!P?_%*@ksElbTSu`+7>>Tv@A$nOlP diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679994348.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1679994348.oneflow-32 deleted file mode 100644 index 0582e12c5094a469dc036407910ba4000679291c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 40 rcmb1OfPlsI-b$Q0*Ho)lbKG>4;!P?_%*@ksElbTSu`)7Tv$+re!V(Mc diff --git a/projects/MagicPrompt/oneflow_magicprompt/log.txt b/projects/MagicPrompt/oneflow_magicprompt/log.txt deleted file mode 100644 index 0e76edb0f..000000000 --- a/projects/MagicPrompt/oneflow_magicprompt/log.txt +++ /dev/null @@ -1,3153 +0,0 @@ -[03/28 10:45:49] libai INFO: Rank of current process: 0. World size: 1 -[03/28 10:45:49] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[03/28 10:45:49] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = False -model.cfg.bias_dropout_fusion = False -model.cfg.scale_mask_softmax_fusion = False -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = True - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=50, - amp=dict(enabled=True), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=1, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers=2 * model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - 
evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/28 10:45:49] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml -[03/28 10:45:49] lb.engine.default INFO: > compiling dataset index builder ... -[03/28 10:45:49] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.052 seconds -[03/28 10:45:49] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.053 seconds -[03/28 10:45:51] lb.engine.default INFO: Prepare training, validating, testing set -[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: building dataset index ... -[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... -[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: reading sizes... -[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: reading pointers... -[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: reading document index... -[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... -[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... -[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... -[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.009791 seconds -[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: -[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 -[03/28 10:45:51] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 -[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy -[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy -[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy -[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.006 seconds -[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 -[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 -[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy -[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy -[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy -[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy 
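The *_doc_idx.npy, *_sample_idx.npy and *_shuffle_idx.npy files being loaded in these log lines are Megatron-style cached index maps. The snippet below is a hypothetical, toy-scale sketch of how the three maps could cooperate; the names follow the log, but the real logic lives in lb.data.datasets.gpt_dataset and may differ in detail:

    import numpy as np

    # Toy stand-ins for the three cached maps named in the log.
    doc_idx = np.array([2, 0, 1, 2, 1, 0])           # document visit order across epochs
    sample_idx = np.array([[0, 0], [1, 5], [3, 2]])  # per sample: (position in doc_idx, token offset)
    shuffle_idx = np.array([2, 0, 1])                # final permutation over samples

    def locate_sample(i):
        # Map a global sample index to (document id, starting token offset).
        j = shuffle_idx[i]               # shuffle at sample granularity
        doc_pos, offset = sample_idx[j]  # where this max_seq_length token window begins
        return int(doc_idx[doc_pos]), int(offset)

    print(locate_sample(0))  # -> (2, 2)

Because the arrays are found on disk, later runs skip rebuilding them, which is why every run in this log loads them in a few milliseconds.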
-[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy
-[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy
-[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/28 10:45:51] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/28 10:45:51] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/28 10:45:51] libai INFO: > Start building model...
-[03/28 10:45:51] lb.engine.default INFO: Model:
-GPTForPreTraining(
-  (GPT_model): GPTModel(
-    (embeddings): GPTEmbedding(
-      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
-      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
-      (dropout): Dropout(p=0.1, inplace=False)
-    )
-    (transformer): Transformer(
-      (layers): ModuleList(
-        (0): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (1)-(11): TransformerLayer( ... )   (11 further blocks, each identical to layer (0); elided)
-      )
-      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-    )
-    (lm_head): LMLogits()
-  )
-  (loss_func): GPTLoss(
-    (lm_loss): ParallelCrossEntropyLoss()
-  )
-)
-[03/28 10:45:51] libai INFO: >>> done with building model. Building time: 0.161 seconds
-[03/28 10:45:51] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/28 10:45:53] lb.engine.trainer INFO: Starting training from iteration 0
-[03/28 10:45:53] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/28 10:45:59] lb.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last):
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
-    self.run_step()
-  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
-    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
-    loss_dict = self.graph(**data)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
-    self._compile(*args, **kwargs)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
-    self.finish_compile_and_init_runtime()
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1165, in finish_compile_and_init_runtime
-    self._c_nn_graph.compile_plan_for_runtime()
-oneflow._oneflow_internal.exception.RuntimeError: Error: Cannot find the kernel matching Current OperatorConf.
The Info of OperatorConf are - op_name: model.GPT_model.transformer.layers.11.mlp.activation_func-scalar_pow_grad-605 - op_type_name: scalar_pow_grad - DeviceType_Name: kMLU - DataType_Name of dy_0: kFloat - DataType_Name of x_0: kFloat - DataType_Name of dx_0: kFloat -Error message from /home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp:120 - op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()): infer blob descs if failed, op name File "/home/zhangxiaoyu/libai/libai/models/utils/graph_base.py", line 107, in build, source < losses.backward() >; File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step, source < loss_dict = self.graph(**data) >; ... 5 more - - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in InferBlobDescs - op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()) - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/operator.cpp", line 351, in InferBlobDescsIf - InferInternalBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, job_desc) - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/user_op.cpp", line 711, in InferInternalBlobDescs - user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult( op_conf().user_conf().op_type_name(), UserOpKernelRegContext(this, GetBlobDesc4BnInOp, parallel_ctx)) -Error Type: oneflow.ErrorProto.op_kernel_not_found_error - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in operator() - -Error Type: oneflow.ErrorProto.runtime_error - -[03/28 10:45:59] lb.engine.hooks INFO: Total training time: 0:00:06 (0:00:00 on hooks) -[03/28 16:17:43] libai INFO: Rank of current process: 0. World size: 1 -[03/28 16:17:43] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[03/28 16:17:43] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = False -model.cfg.bias_dropout_fusion = False -model.cfg.scale_mask_softmax_fusion = False 
-model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = True - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=50, - amp=dict(enabled=True), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=1, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers=2 * model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/28 16:17:43] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml -[03/28 16:17:43] lb.engine.default INFO: > compiling dataset index builder ... -[03/28 16:17:43] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.042 seconds -[03/28 16:17:43] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.043 seconds -[03/28 16:17:46] lb.engine.default INFO: Prepare training, validating, testing set -[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: building dataset index ... -[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... -[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: reading sizes... -[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: reading pointers... -[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: reading document index... -[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... -[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... -[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... 
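Both failed runs in this log die at graph-compile time with op_kernel_not_found on kMLU. The first one, above, names model.GPT_model.transformer.layers.11.mlp.activation_func-scalar_pow_grad-605: the tanh-approximated GELU is expressed with a pow, and the MLU backend registers no scalar_pow_grad kernel for its backward pass. Purely as a hedged sketch (assuming elementwise multiply and tanh gradients are available on MLU, as the rest of this log suggests), the same activation can be written without pow so autograd never emits that op:

    import math
    import oneflow as flow

    def gelu_tanh_without_pow(x):
        # x**3 via two multiplications instead of flow.pow(x, 3.0),
        # so the backward graph contains no scalar_pow_grad op.
        x_cubed = x * x * x
        return 0.5 * x * (1.0 + flow.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x_cubed)))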
-[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.010023 seconds -[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: -[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 -[03/28 16:17:46] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.007 seconds -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/28 16:17:46] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/28 16:17:46] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 -[03/28 16:17:46] libai INFO: > Start building model... 
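The run that follows fails one op later in the same way: the clip-gradient pass needs the fused multi_reduce_sum_pow_abs kernel (see the trace below), which is likewise unregistered for kMLU, and the third run's config reacts by setting optim.params.clip_grad_max_norm = None. If clipping were still wanted, a hypothetical unfused fallback could assemble the global norm from elementwise ops only; this is a sketch under that assumption, not what this series does:

    import oneflow as flow

    def clip_grad_by_global_norm_(parameters, max_norm):
        # Unfused global-norm clipping: sum of squares via mul + sum,
        # avoiding the fused multi_reduce_sum_pow_abs kernel.
        grads = [p.grad for p in parameters if p.grad is not None]
        if not grads:
            return flow.zeros(1)
        total_norm = sum((g * g).sum() for g in grads).sqrt()
        coef = max_norm / (total_norm + 1e-6)
        if coef < 1.0:
            for g in grads:
                g.mul_(coef)
        return total_norm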
-[03/28 16:17:46] lb.engine.default INFO: Model:
-GPTForPreTraining( ... )   (architecture identical to the dump from the 10:45:51 run above; elided)
-[03/28 16:17:46] libai INFO: >>> done with building model. Building time: 0.208 seconds
-[03/28 16:17:46] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/28 16:17:47] lb.engine.trainer INFO: Starting training from iteration 0
-[03/28 16:17:47] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/28 16:17:54] lb.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last): (frames identical to the 10:45 failure above; elided)
-oneflow._oneflow_internal.exception.RuntimeError: Error: Cannot find the kernel matching Current OperatorConf.
The Info of OperatorConf are
- op_name: System-ClipGradient-GlobalNorm-MultiReduceSumPowAbs-423
- op_type_name: multi_reduce_sum_pow_abs
- DeviceType_Name: kMLU
- DataType_Name of x_0 ... x_97: kFloat (98 inputs, all kFloat; the repeated per-input lines are elided)
- DataType_Name of y_0: kFloat
-Error message from /home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp:120
- op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()): infer blob descs if failed, op name
-
- File
"/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in InferBlobDescs - op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()) - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/operator.cpp", line 351, in InferBlobDescsIf - InferInternalBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, job_desc) - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/user_op.cpp", line 711, in InferInternalBlobDescs - user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult( op_conf().user_conf().op_type_name(), UserOpKernelRegContext(this, GetBlobDesc4BnInOp, parallel_ctx)) -Error Type: oneflow.ErrorProto.op_kernel_not_found_error - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in operator() - -Error Type: oneflow.ErrorProto.runtime_error - -[03/28 16:17:54] lb.engine.hooks INFO: Total training time: 0:00:06 (0:00:00 on hooks) -[03/28 16:26:02] libai INFO: Rank of current process: 0. World size: 1 -[03/28 16:26:02] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[03/28 16:26:03] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = False -model.cfg.bias_dropout_fusion = False -model.cfg.scale_mask_softmax_fusion = False -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = False - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 -optim.params.clip_grad_max_norm = None - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=50, - amp=dict(enabled=True), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=1, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # 
pipeline_num_layers=2 * model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/28 16:26:03] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml -[03/28 16:26:03] lb.engine.default INFO: > compiling dataset index builder ... -[03/28 16:26:03] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.309 seconds -[03/28 16:26:03] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.310 seconds -[03/28 16:26:06] lb.engine.default INFO: Prepare training, validating, testing set -[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: building dataset index ... -[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... -[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: reading sizes... -[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: reading pointers... -[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: reading document index... -[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... -[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... -[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... -[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.011565 seconds -[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: -[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 -[03/28 16:26:06] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 -[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy -[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy -[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy -[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.007 seconds -[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 -[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 -[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy -[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy -[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy -[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: total 
-[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy
-[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy
-[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy
-[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/28 16:26:06] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/28 16:26:06] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
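The doc-idx / sample-idx / shuffle-idx paths loaded above all follow one naming scheme that encodes the requested sample count, the sequence length, and the shuffle seed. A small sketch that reproduces the paths seen in this log; the helper name is hypothetical, illustrative only, and not libai's actual builder:

    # Illustrative only: reconstructs the index-map file names that
    # lb.data.datasets.gpt_dataset logs above.
    def indexmap_path(data_prefix, name, num_samples, seq_length, seed, kind):
        # kind is one of "doc_idx", "sample_idx", "shuffle_idx"
        return (f"{data_prefix}_{name}_indexmap_{num_samples}ns_"
                f"{seq_length}sl_{seed}s_{kind}.npy")

    # Reproduces the first mapping loaded at 16:26:06:
    print(indexmap_path(
        "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence",
        "gpt-2", 40000, 1024, 1234, "doc_idx"))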
-[03/28 16:26:06] libai INFO: > Start building model...
-[03/28 16:26:07] lb.engine.default INFO: Model:
-GPTForPreTraining(
-  (GPT_model): GPTModel(
-    (embeddings): GPTEmbedding(
-      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
-      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
-      (dropout): Dropout(p=0.1, inplace=False)
-    )
-    (transformer): Transformer(
-      (layers): ModuleList(
-        (0): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        [... (1)-(11): eleven further TransformerLayer blocks, identical to (0) ...]
-      )
-      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-    )
-    (lm_head): LMLogits()
-  )
-  (loss_func): GPTLoss(
-    (lm_loss): ParallelCrossEntropyLoss()
-  )
-)
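The printed shapes are enough to sanity-check the size of this model. A quick back-of-the-envelope count, plain arithmetic from the dimensions above, assuming (as is standard for GPT-2) that LMLogits shares the token-embedding weights:

    # Parameter count implied by the printout: 12 layers, hidden size 768,
    # padded vocab 50304, 1024 positions.
    hidden, layers, vocab, pos = 768, 12, 50304, 1024

    embed = vocab * hidden + pos * hidden            # token + position tables
    qkv = hidden * 3 * hidden + 3 * hidden           # query_key_value: 768 -> 2304
    attn_out = hidden * hidden + hidden              # attention dense: 768 -> 768
    mlp = (hidden * 4 * hidden + 4 * hidden) + (4 * hidden * hidden + hidden)
    norms = 2 * 2 * hidden                           # two LayerNorms per layer
    per_layer = qkv + attn_out + mlp + norms

    total = embed + layers * per_layer + 2 * hidden  # + final layernorm_f
    print(f"{total / 1e6:.1f}M parameters")          # -> 124.5M, the GPT-2 base size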
-[03/28 16:26:07] libai INFO: >>> done with building model. Building time: 0.790 seconds
-[03/28 16:26:07] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/28 16:26:09] lb.engine.trainer INFO: Starting training from iteration 0
-[03/28 16:26:09] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/28 16:26:17] lb.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last):
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
-    self.run_step()
-  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
-    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
-    loss_dict = self.graph(**data)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
-    self._compile(*args, **kwargs)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
-    self.finish_compile_and_init_runtime()
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1165, in finish_compile_and_init_runtime
-    self._c_nn_graph.compile_plan_for_runtime()
-oneflow._oneflow_internal.exception.RuntimeError: Error: Cannot find the kernel matching Current OperatorConf. The Info of OperatorConf are
-  op_name: System-DynamicLossScale-MultiCountNotFinite-366
-  op_type_name: multi_count_not_finite
-  DeviceType_Name: kMLU
-  DataType_Name of x_0 ... x_147: kFloat (148 inputs, all kFloat)
-  DataType_Name of y_0: kInt64
-Error message from /home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp:120
-  op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()): infer blob descs if failed, op name
-
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in InferBlobDescs
-    op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc())
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/operator.cpp", line 351, in InferBlobDescsIf
-    InferInternalBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, job_desc)
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/user_op.cpp", line 711, in InferInternalBlobDescs
-    user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult( op_conf().user_conf().op_type_name(), UserOpKernelRegContext(this, GetBlobDesc4BnInOp, parallel_ctx))
-Error Type: oneflow.ErrorProto.op_kernel_not_found_error
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in operator()
-
-Error Type: oneflow.ErrorProto.runtime_error
-
-[03/28 16:26:17] lb.engine.hooks INFO: Total training time: 0:00:08 (0:00:00 on hooks)
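Both aborted runs die while materializing the same operator, System-DynamicLossScale-MultiCountNotFinite-366. The multi_count_not_finite op is emitted by the dynamic-loss-scale pass that amp=dict(enabled=True) switches on, and at this point the cambricon backend has no MLU kernel registered for it, hence op_kernel_not_found_error at plan-compile time. The most direct workaround (a suggestion on my part, not something this log shows being applied) is to train without graph AMP:

    # Sketch of a config-level workaround in gpt2_training.py: with AMP off,
    # no DynamicLossScale subgraph (and no multi_count_not_finite op) is
    # generated, so the MLU execution plan can compile.
    train.update(
        dict(
            amp=dict(enabled=False),
        )
    )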
-[03/28 16:29:20] libai INFO: Rank of current process: 0. World size: 1
-[03/28 16:29:20] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/28 16:29:20] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
-[... identical to the config dumped at 16:26 above, except that gradient clipping is now fully disabled: ...]
-optim.lr = 5.0e-05
-optim.params.clip_grad_max_norm = None
-optim.params.clip_grad_norm_type = None
-
-[03/28 16:29:20] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/28 16:29:20] lb.engine.default INFO: > compiling dataset index builder ...
-[03/28 16:29:20] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.187 seconds
-[03/28 16:29:20] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.187 seconds
-[03/28 16:29:24] lb.engine.default INFO: Prepare training, validating, testing set
-[03/28 16:29:24] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[... index mmap warm-up and doc/sample/shuffle index-map loading, identical to the 16:26 run: 73718 documents, 106365 sentences; 40608 / 4512 / 4512 samples ...]
-[03/28 16:29:24] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/28 16:29:24] libai INFO: > Start building model...
-[03/28 16:29:25] lb.engine.default INFO: Model:
-[... GPTForPreTraining printout, identical to the one above ...]
-[03/28 16:29:25] libai INFO: >>> done with building model. Building time: 0.518 seconds
-[03/28 16:29:25] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/28 16:29:27] lb.engine.trainer INFO: Starting training from iteration 0
-[03/28 16:29:27] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/28 16:29:37] lb.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last):
-[... same Python stack as the 16:26 failure (trainer.run_step -> graph.__call__ -> finish_compile_and_init_runtime) ...]
-oneflow._oneflow_internal.exception.RuntimeError: Error: Cannot find the kernel matching Current OperatorConf. The Info of OperatorConf are
-  op_name: System-DynamicLossScale-MultiCountNotFinite-366
-  op_type_name: multi_count_not_finite
-  DeviceType_Name: kMLU
-[... 148 kFloat inputs, one kInt64 output, and the op_kernel_not_found_error report, identical to the previous failure ...]
-Error Type: oneflow.ErrorProto.runtime_error
-
-[03/28 16:29:37] lb.engine.hooks INFO: Total training time: 0:00:10 (0:00:00 on hooks)
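The retry at 16:29 only adds optim.params.clip_grad_norm_type = None and fails on the identical kernel, which confirms the op comes from loss scaling, not from gradient clipping. If mixed precision is still wanted, a static loss scale avoids the dynamic-loss-scale ops entirely. A minimal sketch on a bare nn.Graph; whether libai's graph wrapper exposes set_grad_scaler is an assumption:

    import oneflow as flow

    class AmpTrainGraph(flow.nn.Graph):
        def __init__(self, model, optimizer):
            super().__init__()
            self.model = model
            self.add_optimizer(optimizer)
            self.config.enable_amp(True)
            # A fixed loss scale needs no per-step finiteness check, so no
            # multi_count_not_finite kernel is required on the device.
            self.set_grad_scaler(flow.amp.StaticGradScaler(2.0 ** 12))

        def build(self, input_ids):
            loss = self.model(input_ids)
            loss.backward()
            return loss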
-[03/28 16:58:19] libai INFO: Rank of current process: 0. World size: 1
-[03/28 16:58:19] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/28 16:58:19] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
-[... identical to the 16:29 config, including the disabled gradient clipping ...]
-
-[03/28 16:58:19] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/28 16:58:19] lb.engine.default INFO: > compiling dataset index builder ...
-[03/28 16:58:19] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.056 seconds
-[03/28 16:58:19] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.056 seconds
-[03/28 16:58:22] lb.engine.default INFO: Prepare training, validating, testing set
-[03/28 16:58:22] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[... index mmap warm-up and index-map loading, identical to the earlier runs ...]
-[03/28 16:58:22] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/28 16:58:22] libai INFO: > Start building model...
-[03/28 16:58:23] lb.engine.default INFO: Model:
-GPTForPreTraining(
-[... embeddings and TransformerLayers (0)-(10), identical to the printout above ...]
(output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (4): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (5): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (6): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (7): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): 
MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (8): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (9): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (10): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (11): TransformerLayer( - 
(drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - ) - (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - ) - (lm_head): LMLogits() - ) - (loss_func): GPTLoss( - (lm_loss): ParallelCrossEntropyLoss() - ) -) -[03/28 16:58:23] libai INFO: >>> done with building model. Building time: 0.891 seconds -[03/28 16:58:23] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR -[03/28 16:58:25] lb.engine.trainer INFO: Starting training from iteration 0 -[03/28 16:58:25] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... -[03/28 16:58:31] lb.engine.trainer ERROR: Exception during training: -Traceback (most recent call last): - File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train - self.run_step() - File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step - self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) - File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step - loss_dict = self.graph(**data) - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__ - self._compile(*args, **kwargs) - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile - self.finish_compile_and_init_runtime() - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1165, in finish_compile_and_init_runtime - self._c_nn_graph.compile_plan_for_runtime() -oneflow._oneflow_internal.exception.RuntimeError: Error: Cannot find the kernel matching Current OperatorConf. 
The Info of OperatorConf are - op_name: System-DynamicLossScale-MultiCountNotFinite-366 - op_type_name: multi_count_not_finite - DeviceType_Name: kMLU - DataType_Name of x_0: kFloat - DataType_Name of x_1: kFloat - DataType_Name of x_2: kFloat - DataType_Name of x_3: kFloat - DataType_Name of x_4: kFloat - DataType_Name of x_5: kFloat - DataType_Name of x_6: kFloat - DataType_Name of x_7: kFloat - DataType_Name of x_8: kFloat - DataType_Name of x_9: kFloat - DataType_Name of x_10: kFloat - DataType_Name of x_11: kFloat - DataType_Name of x_12: kFloat - DataType_Name of x_13: kFloat - DataType_Name of x_14: kFloat - DataType_Name of x_15: kFloat - DataType_Name of x_16: kFloat - DataType_Name of x_17: kFloat - DataType_Name of x_18: kFloat - DataType_Name of x_19: kFloat - DataType_Name of x_20: kFloat - DataType_Name of x_21: kFloat - DataType_Name of x_22: kFloat - DataType_Name of x_23: kFloat - DataType_Name of x_24: kFloat - DataType_Name of x_25: kFloat - DataType_Name of x_26: kFloat - DataType_Name of x_27: kFloat - DataType_Name of x_28: kFloat - DataType_Name of x_29: kFloat - DataType_Name of x_30: kFloat - DataType_Name of x_31: kFloat - DataType_Name of x_32: kFloat - DataType_Name of x_33: kFloat - DataType_Name of x_34: kFloat - DataType_Name of x_35: kFloat - DataType_Name of x_36: kFloat - DataType_Name of x_37: kFloat - DataType_Name of x_38: kFloat - DataType_Name of x_39: kFloat - DataType_Name of x_40: kFloat - DataType_Name of x_41: kFloat - DataType_Name of x_42: kFloat - DataType_Name of x_43: kFloat - DataType_Name of x_44: kFloat - DataType_Name of x_45: kFloat - DataType_Name of x_46: kFloat - DataType_Name of x_47: kFloat - DataType_Name of x_48: kFloat - DataType_Name of x_49: kFloat - DataType_Name of x_50: kFloat - DataType_Name of x_51: kFloat - DataType_Name of x_52: kFloat - DataType_Name of x_53: kFloat - DataType_Name of x_54: kFloat - DataType_Name of x_55: kFloat - DataType_Name of x_56: kFloat - DataType_Name of x_57: kFloat - DataType_Name of x_58: kFloat - DataType_Name of x_59: kFloat - DataType_Name of x_60: kFloat - DataType_Name of x_61: kFloat - DataType_Name of x_62: kFloat - DataType_Name of x_63: kFloat - DataType_Name of x_64: kFloat - DataType_Name of x_65: kFloat - DataType_Name of x_66: kFloat - DataType_Name of x_67: kFloat - DataType_Name of x_68: kFloat - DataType_Name of x_69: kFloat - DataType_Name of x_70: kFloat - DataType_Name of x_71: kFloat - DataType_Name of x_72: kFloat - DataType_Name of x_73: kFloat - DataType_Name of x_74: kFloat - DataType_Name of x_75: kFloat - DataType_Name of x_76: kFloat - DataType_Name of x_77: kFloat - DataType_Name of x_78: kFloat - DataType_Name of x_79: kFloat - DataType_Name of x_80: kFloat - DataType_Name of x_81: kFloat - DataType_Name of x_82: kFloat - DataType_Name of x_83: kFloat - DataType_Name of x_84: kFloat - DataType_Name of x_85: kFloat - DataType_Name of x_86: kFloat - DataType_Name of x_87: kFloat - DataType_Name of x_88: kFloat - DataType_Name of x_89: kFloat - DataType_Name of x_90: kFloat - DataType_Name of x_91: kFloat - DataType_Name of x_92: kFloat - DataType_Name of x_93: kFloat - DataType_Name of x_94: kFloat - DataType_Name of x_95: kFloat - DataType_Name of x_96: kFloat - DataType_Name of x_97: kFloat - DataType_Name of x_98: kFloat - DataType_Name of x_99: kFloat - DataType_Name of x_100: kFloat - DataType_Name of x_101: kFloat - DataType_Name of x_102: kFloat - DataType_Name of x_103: kFloat - DataType_Name of x_104: kFloat - DataType_Name of x_105: kFloat - DataType_Name 
of x_106: kFloat - DataType_Name of x_107: kFloat - DataType_Name of x_108: kFloat - DataType_Name of x_109: kFloat - DataType_Name of x_110: kFloat - DataType_Name of x_111: kFloat - DataType_Name of x_112: kFloat - DataType_Name of x_113: kFloat - DataType_Name of x_114: kFloat - DataType_Name of x_115: kFloat - DataType_Name of x_116: kFloat - DataType_Name of x_117: kFloat - DataType_Name of x_118: kFloat - DataType_Name of x_119: kFloat - DataType_Name of x_120: kFloat - DataType_Name of x_121: kFloat - DataType_Name of x_122: kFloat - DataType_Name of x_123: kFloat - DataType_Name of x_124: kFloat - DataType_Name of x_125: kFloat - DataType_Name of x_126: kFloat - DataType_Name of x_127: kFloat - DataType_Name of x_128: kFloat - DataType_Name of x_129: kFloat - DataType_Name of x_130: kFloat - DataType_Name of x_131: kFloat - DataType_Name of x_132: kFloat - DataType_Name of x_133: kFloat - DataType_Name of x_134: kFloat - DataType_Name of x_135: kFloat - DataType_Name of x_136: kFloat - DataType_Name of x_137: kFloat - DataType_Name of x_138: kFloat - DataType_Name of x_139: kFloat - DataType_Name of x_140: kFloat - DataType_Name of x_141: kFloat - DataType_Name of x_142: kFloat - DataType_Name of x_143: kFloat - DataType_Name of x_144: kFloat - DataType_Name of x_145: kFloat - DataType_Name of x_146: kFloat - DataType_Name of x_147: kFloat - DataType_Name of y_0: kInt64 -Error message from /home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp:120 - op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()): infer blob descs if failed, op name - - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in InferBlobDescs - op_->InferBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, &GlobalJobDesc()) - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/operator.cpp", line 351, in InferBlobDescsIf - InferInternalBlobDescsIf(GetBlobDesc4BnInOp, parallel_ctx, job_desc) - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/operator/user_op.cpp", line 711, in InferInternalBlobDescs - user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult( op_conf().user_conf().op_type_name(), UserOpKernelRegContext(this, GetBlobDesc4BnInOp, parallel_ctx)) -Error Type: oneflow.ErrorProto.op_kernel_not_found_error - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/graph/exec_graph.cpp", line 120, in operator() - -Error Type: oneflow.ErrorProto.runtime_error - -[03/28 16:58:31] lb.engine.hooks INFO: Total training time: 0:00:06 (0:00:00 on hooks) -[03/28 17:05:44] libai INFO: Rank of current process: 0. 
World size: 1 -[03/28 17:05:44] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[03/28 17:05:44] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = False -model.cfg.bias_dropout_fusion = False -model.cfg.scale_mask_softmax_fusion = False -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = False - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 -optim.params.clip_grad_max_norm = None -optim.params.clip_grad_norm_type = None - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=50, - amp=dict(enabled=True), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=1, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers=2 * model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/28 17:05:44] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml -[03/28 17:05:44] lb.engine.default INFO: > compiling dataset index builder ... -[03/28 17:05:44] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.043 seconds -[03/28 17:05:44] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.044 seconds -[03/28 17:05:46] lb.engine.default INFO: Prepare training, validating, testing set -[03/28 17:05:46] lb.data.data_utils.indexed_dataset INFO: building dataset index ... 
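[Editor's note] The 16:58 run above aborted while compiling the train graph, and the 17:05 re-run below fails identically: this config keeps amp=dict(enabled=True), AMP training uses dynamic loss scaling, and the loss-scale update emits a multi_count_not_finite op (op_name: System-DynamicLossScale-MultiCountNotFinite-366) for which no kernel is registered on kMLU. A minimal sketch of the config-side workaround, which is presumably why [PATCH 08/25] below flips the same switch in gpt2_training.py:

from configs.common.train import train

# Sketch only: with AMP disabled, no System-DynamicLossScale-* ops are
# inserted into the compiled graph, so the missing multi_count_not_finite
# MLU kernel is never requested.
train.update(dict(amp=dict(enabled=False)))

The long-term fix is registering the kernel in oneflow-cambricon; turning AMP off merely sidesteps the gap during bring-up.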
-[03/28 17:05:46] lb.data.data_utils.indexed_dataset INFO: [index warm-up and GPT dataset loading logs matching the 16:58:22 run above: 73718 documents, 106365 sentences; doc-idx/sample-idx/shuffle-idx mappings loaded; 40608 train samples over 9 epochs and 4512 samples over 1 epoch for each of the two smaller splits; near-verbatim repeated lines elided] -[03/28 17:05:47] 
lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 -[03/28 17:05:47] libai INFO: > Start building model... -[03/28 17:05:47] lb.engine.default INFO: Model: -GPTForPreTraining( [architecture printout identical to the 16:58:23 run above; repeated output elided] ) -[03/28 17:05:47] libai INFO: >>> done with building model. Building time: 0.294 seconds -[03/28 17:05:47] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR -[03/28 17:05:48] lb.engine.trainer INFO: Starting training from iteration 0 -[03/28 17:05:48] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... -[03/28 17:05:54] lb.engine.trainer ERROR: Exception during training: -Traceback (most recent call last): [Python traceback, OperatorConf dump, and C++ error trace identical to the 16:58:31 failure above; repeated output elided] -oneflow._oneflow_internal.exception.RuntimeError: Error: Cannot find the kernel matching Current OperatorConf. (op_name: System-DynamicLossScale-MultiCountNotFinite-366, op_type_name: multi_count_not_finite, DeviceType_Name: kMLU) -Error Type: oneflow.ErrorProto.op_kernel_not_found_error -Error Type: oneflow.ErrorProto.runtime_error - -[03/28 17:05:54] lb.engine.hooks INFO: Total training time: 0:00:06 (0:00:00 on hooks) diff --git a/projects/MagicPrompt/oneflow_magicprompt/metrics.json b/projects/MagicPrompt/oneflow_magicprompt/metrics.json deleted file mode 100644 index e69de29bb..000000000 From 811584aa7fac71128dca18b1c6725916f4264af2 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Wed, 29 Mar 2023 16:28:24 +0800 Subject: [PATCH 08/25] refine script --- configs/common/optim.py | 4 ++-- libai/version.py | 2 +- projects/MagicPrompt/configs/gpt2_training.py | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/configs/common/optim.py b/configs/common/optim.py index 7cd5f2cf0..8e062f2bf 100644 --- a/configs/common/optim.py +++ b/configs/common/optim.py @@ -7,8 +7,8 @@ params=LazyCall(get_default_optimizer_params)( # params.model is meant to be set to the model object, # before instantiating the optimizer.
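# (Editorial note, not a line of the original file.) The change below comments
# out the clip-grad defaults. gpt2_training.py in this series already sets
# clip_grad_max_norm and clip_grad_norm_type to None, and grad-norm clipping on
# the MLU bring-up is plausibly not supported yet (an assumption; the patch
# itself gives no rationale).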
- clip_grad_max_norm=1.0, - clip_grad_norm_type=2.0, + # clip_grad_max_norm=1.0, + # clip_grad_norm_type=2.0, weight_decay_norm=0.0, weight_decay_bias=0.0, ), diff --git a/libai/version.py b/libai/version.py index 1b36290b1..e7d545026 100644 --- a/libai/version.py +++ b/libai/version.py @@ -1,2 +1,2 @@ __version__ = '0.2.0' -git_version = 'ec311a739120c1f93edf0dcc86627f61e84bcb02' +git_version = 'd47fe9c7908c1e7a4604574b2bd58a7e06d20645' diff --git a/projects/MagicPrompt/configs/gpt2_training.py b/projects/MagicPrompt/configs/gpt2_training.py index 4f7d5a540..a40bf13ca 100644 --- a/projects/MagicPrompt/configs/gpt2_training.py +++ b/projects/MagicPrompt/configs/gpt2_training.py @@ -9,6 +9,7 @@ from configs.common.train import train from configs.common.models.graph import graph +graph.enabled=False vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" @@ -58,8 +59,8 @@ test_micro_batch_size=4, train_epoch=33, train_iter=10000, - log_period=50, - amp=dict(enabled=True), + log_period=1, + amp=dict(enabled=False), warmup_ratio=0, checkpointer=dict(period=8000, max_to_keep=20), dist=dict( From 44520ad17b3e6a593ba4b501f7c4ff23eebb56e2 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Thu, 30 Mar 2023 11:13:57 +0800 Subject: [PATCH 09/25] refine --- libai/layers/embedding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libai/layers/embedding.py b/libai/layers/embedding.py index 06873fa44..f0a5cafb3 100644 --- a/libai/layers/embedding.py +++ b/libai/layers/embedding.py @@ -71,8 +71,8 @@ def __init__( sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), ) ) - # if os.getenv("ONEFLOW_LINEAR_EMBEDDING_SKIP_INIT", "0") != "1": - # self.init_method(self.weight) + if os.getenv("ONEFLOW_LINEAR_EMBEDDING_SKIP_INIT", "0") != "1": + self.init_method(self.weight) # FIXME(lxy): Fill padding_idx is not supported in nd_sbp right now. 
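# (Editorial note, not a line of the original file.) [PATCH 09/25] restores the
# embedding weight initialization that the earlier MLU bring-up had commented
# out (the removed lines above), while keeping ONEFLOW_LINEAR_EMBEDDING_SKIP_INIT
# as an explicit env-var escape hatch for skipping init.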
# self._fill_padding_idx_with_zero() From b7a8f3e616f77b472c6a4e00453d0e8853697057 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Fri, 31 Mar 2023 15:01:58 +0800 Subject: [PATCH 10/25] refine --- libai/inference/generator/generation_utils.py | 2 +- projects/MagicPrompt/configs/gpt2_training.py | 5 +- .../oneflow_magicprompt/config.yaml | 76 + .../events.out.tfevents.1680146933.oneflow-32 | Bin 0 -> 3920 bytes .../events.out.tfevents.1680147471.oneflow-32 | Bin 0 -> 40 bytes .../events.out.tfevents.1680148160.oneflow-32 | Bin 0 -> 40 bytes .../events.out.tfevents.1680148191.oneflow-32 | Bin 0 -> 40 bytes .../events.out.tfevents.1680162058.oneflow-32 | Bin 0 -> 40 bytes .../events.out.tfevents.1680163617.oneflow-32 | 0 .../events.out.tfevents.1680163809.oneflow-32 | 0 .../events.out.tfevents.1680163941.oneflow-32 | 0 .../events.out.tfevents.1680168494.oneflow-32 | 0 .../events.out.tfevents.1680168780.oneflow-32 | Bin 0 -> 40 bytes .../events.out.tfevents.1680168820.oneflow-32 | Bin 0 -> 3920 bytes .../events.out.tfevents.1680171506.oneflow-32 | Bin 0 -> 40 bytes .../events.out.tfevents.1680244302.oneflow-32 | Bin 0 -> 40 bytes .../events.out.tfevents.1680244852.oneflow-32 | Bin 0 -> 40 bytes .../events.out.tfevents.1680245650.oneflow-32 | Bin 0 -> 40 bytes .../events.out.tfevents.1680245838.oneflow-32 | 0 .../events.out.tfevents.1680245977.oneflow-32 | 0 .../MagicPrompt/oneflow_magicprompt/log.txt | 6904 +++++++++++++++++ .../oneflow_magicprompt/log.txt.rank1 | 2084 +++++ .../oneflow_magicprompt/log.txt.rank2 | 2084 +++++ .../oneflow_magicprompt/log.txt.rank3 | 2084 +++++ .../oneflow_magicprompt/metrics.json | 65 + 25 files changed, 13302 insertions(+), 2 deletions(-) create mode 100644 projects/MagicPrompt/oneflow_magicprompt/config.yaml create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680146933.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680147471.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680148160.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680148191.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680162058.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680163617.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680163809.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680163941.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680168494.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680168780.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680168820.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680171506.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680244302.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680244852.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680245650.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680245838.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680245977.oneflow-32 create mode 100644 
projects/MagicPrompt/oneflow_magicprompt/log.txt create mode 100644 projects/MagicPrompt/oneflow_magicprompt/log.txt.rank1 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/log.txt.rank2 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/log.txt.rank3 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/metrics.json diff --git a/libai/inference/generator/generation_utils.py b/libai/inference/generator/generation_utils.py index 14b8e3dee..38563bbef 100644 --- a/libai/inference/generator/generation_utils.py +++ b/libai/inference/generator/generation_utils.py @@ -493,7 +493,7 @@ def greedy_search( scores += (next_token_scores,) # argmax - next_tokens = flow.argmax(next_token_scores.to("cpu"), dim=-1).to("mlu") + next_tokens = flow.argmax(next_token_scores, dim=-1) next_tokens = next_tokens.to_global(placement=input_ids.placement) unfinished_sequences = unfinished_sequences.to_global( sbp=next_tokens.sbp, placement=next_tokens.placement diff --git a/projects/MagicPrompt/configs/gpt2_training.py b/projects/MagicPrompt/configs/gpt2_training.py index b5182e6c2..7b4eb3c11 100644 --- a/projects/MagicPrompt/configs/gpt2_training.py +++ b/projects/MagicPrompt/configs/gpt2_training.py @@ -10,6 +10,7 @@ from configs.common.models.graph import graph graph.enabled=False +graph.debug = 2 vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" @@ -64,9 +65,11 @@ warmup_ratio=0, checkpointer=dict(period=8000, max_to_keep=20), dist=dict( - data_parallel_size=1, + data_parallel_size=4, tensor_parallel_size=1, pipeline_parallel_size=1, + # pipeline_num_layers = 12, + # custom_pipeline_stage_id = [0] * 6 + [1] * 6, # pipeline_num_layers=model.cfg.hidden_layers, ), scheduler=LazyCall(WarmupExponentialLR)( diff --git a/projects/MagicPrompt/oneflow_magicprompt/config.yaml b/projects/MagicPrompt/oneflow_magicprompt/config.yaml new file mode 100644 index 000000000..61b473e5c --- /dev/null +++ b/projects/MagicPrompt/oneflow_magicprompt/config.yaml @@ -0,0 +1,76 @@ +dataloader: + train: + _target_: libai.data.build_nlp_train_val_test_loader + dataset: + - _target_: libai.data.datasets.GPT2Dataset + data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence + indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false} + max_seq_length: 1024 + name: gpt-2 + seed: 1234 + num_workers: 4 + splits: + - [949.0, 50.0, 1.0] + train_val_test_num_samples: null + weights: [1.0] +ds: + _target_: libai.data.datasets.GPT2Dataset + data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence + indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false} + max_seq_length: 1024 + name: gpt-2 + seed: 1234 +graph: + auto_parallel: {enable_auto_parallel_ignore_user_sbp_config: false, enabled: false, sbp_collector: false, trunk_algo: true} + debug: 2 + enabled: false + eval_graph: {_target_: libai.models.utils.GraphBase, is_train: false} + train_graph: {_target_: libai.models.utils.GraphBase, is_train: true} +model: + _target_: projects.MagicPrompt.gpt2.GPTForPreTraining + cfg: {amp_enabled: false, apply_query_key_layer_scaling: true, apply_residual_post_layernorm: false, attention_dropout_prob: 0.1, bias_dropout_fusion: false, bias_gelu_fusion: false, bos_token_id: 50256, 
chunk_size_feed_forward: 0, decoder_start_token_id: null, diversity_penalty: 0.0, do_sample: false, early_stopping: false, embedding_dropout_prob: 0.1, encoder_no_repeat_ngram_size: 0, eos_token_id: 50256, exponential_decay_length_penalty: null, ffn_hidden_size: 3072, forced_bos_token_id: null, forced_eos_token_id: null, hidden_layers: 12, hidden_size: 768, initializer_range: 0.02, is_encoder_decoder: false, layernorm_epsilon: 1.0e-05, length_penalty: 1.0, max_length: 20, max_seq_length: 1024, min_length: 0, no_repeat_ngram_size: 0, num_attention_heads: 12, num_beam_groups: 1, num_beams: 1, num_return_sequences: 1, output_dropout_prob: 0.1, output_scores: false, pad_token_id: 0, pretrained_model_path: null, remove_invalid_values: false, repetition_penalty: 1.0, scale_mask_softmax_fusion: false, sep_token_id: null, temperature: 1.0, top_k: 50, top_p: 1.0, typical_p: 1.0, use_cache: true, use_scaled_init_for_output_weights: true, vocab_size: 50257} +optim: + _target_: oneflow.nn.optimizer.adamw.AdamW + betas: [0.9, 0.999] + do_bias_correction: true + eps: 1.0e-08 + lr: 5.0e-05 + params: {_target_: libai.optim.get_default_optimizer_params, clip_grad_max_norm: null, clip_grad_norm_type: null, weight_decay_bias: 0.0, weight_decay_norm: 0.0} + weight_decay: 0.01 +tokenization: + append_eod: false + make_vocab_size_divisible_by: 128 + tokenizer: {_target_: libai.tokenizer.GPT2Tokenizer, do_chinese_wwm: true, do_lower_case: true, merges_file: /home/zhangxiaoyu/magicprompt/merges.txt, vocab_file: /home/zhangxiaoyu/magicprompt/vocab.json} +train: + activation_checkpoint: {enabled: false} + amp: {enabled: false} + checkpointer: {max_to_keep: 20, period: 8000} + consumed_train_samples: 0 + consumed_valid_samples: 0 + dist: {data_parallel_size: 4, num_gpus_per_node: 4, num_nodes: 1, pipeline_num_layers: 10000, pipeline_parallel_size: 1, tensor_parallel_size: 1} + evaluation: + enabled: true + eval_iter: 250 + eval_period: 4000 + evaluator: {_target_: libai.evaluation.PPLEvaluator} + global_batch_size: 16 + input_placement_device: cpu + load_weight: '' + log_period: 1 + nccl_fusion_max_ops: 24 + nccl_fusion_threshold_mb: 16 + num_accumulation_steps: 1 + output_dir: projects/MagicPrompt/oneflow_magicprompt + rdma_enabled: false + resume: false + samples: 160000 + scheduler: {_target_: libai.scheduler.WarmupExponentialLR, gamma: 1.0, warmup_factor: 0.0, warmup_iter: 0.0, warmup_method: linear} + seed: 1234 + start_iter: 0 + test_micro_batch_size: 4 + train_epoch: 33 + train_iter: 10000 + train_micro_batch_size: 4 + train_samples: null + warmup_ratio: 0 + zero_optimization: {enabled: false, stage: 1} diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680146933.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680146933.oneflow-32 new file mode 100644 index 0000000000000000000000000000000000000000..6481a1dc7430a67b9cdba76a73d0266709fcdf89 GIT binary patch literal 3920 zcmZYAdr(w$6u@z1LArN&>@M;U4M8!~*fet#FEhvw(6pQ2QjISVm!Odm*$_0vFu{}* zNmNu!KooO?MkQtdGZ6$u1eP}iiVFBh!$-;j`JC9>&hK~6@7}+^bLM>RnceF&;{PY_ zJAMM=vus(SFVl&ft;5%8qhdaow;?hno|~b}j?R78)UBF!dm__ZR=UmDO;xc9mWj~D zYs2EB)<)WIx_5ya@br7(Vv0MvR5O?{)uz~vRkCcnE?yfQ7OjhmvroLNMpu6tighs6 zLE4LxnZ8C-EoT|lEP9>2W2wd~RG&I$ys6Ug0ZxoZ`P5y;1r#Ic6?)HZ;xcQdZ*eu{ zQy%9pX7VrNoL-`Si5uFQ)7s=@>V@~P;0o@=Yq*7OaR+NO2Odoq)-a>j2*^0a83H^4 z!5YgoIH%W`%Q~X|n{|a?jV{uQ*QodGH8`8rRUPS2Gh8s9Yt718bKw?DS_4W|RE=^o zl_NJl8f5m6ax|-A<&ndei;MhH7ZVX@|L~=ZUmjx^0Kz}Ih7Jls87*{AT+5AUTJH#i zdv4*J7M^)&kDNVjv}_`DqFw2)6!qQk#b)B<5f1@6Vd^6f$GK=_JuM40wk 
zk7l(5tATLoB|K3gbjXqMlaf_MK-kmbQ7Q=Ep@qg9?hdY834~c~IH!gGj8&o?o+W<) zp>sR##lk+X2j2i;0_lj*c*6-FwygrfR#HlY>fsyO=u{Jc&@3yZOb{}(&?!^RsmA$^ z1H!l2IH!fObGB&3hSk16n3jWku~6O?`T_{ekK&F9zc{T&Dg9X`Kp0C(i7;S|jDOl5 z;SYrS`}(Q`AxjHmS8BMl;{_o=82A|Hw9w&m8x*o8{0R_VCB0ZUHs_Wn5V}9XbBVAq zNsq=)^MYTCL{dtGD}rQv)!vmQK={3FXssZW)53;>L0sp|nQlOMU4e62=yuW?9eHS5 z0fd1{+>3?B?W2AL!XnZUq49=$-HYnL4XsspqC}`1zM-c3-U%SA+gaBn2(4)0`;CjZ zShi&q5N=GxIW6@3*9tZDcDnAH9cD18gB!H1*DV+JHD0i z89CmEfN-wa{67VuH7(4{_2)Fls=|S=Rfcm~IFxIJe)Ft#0m49Y+>3<;woAi-u$Xj2 z=rH^}w0O{P00>6A2yJQM zn@WFflHL6bAiP+Eb6S{FEk~}F^_4(4=Nj(C!aE`7eSq))>4?yH!#5kl(t+?1DJ8<> z;TvXuweJEDzS*+>Z$YS_g=1bX;EpN|M*-p0tvIKJ3I1|4-jM4JgaJvo7Yh$v%FhDA zW27TO;|*=YKZ3u*7o?O3!-j8Yc-5f;!h~pxXM#{k3k!e$nA?`2xB`S;F*v7%ZH{vE uET~ukgoUIR3)%ZMn}JZF!*hvn^8-CfJ^ynjxZ!G2N`!vtGJe3va?XF{W>y;j literal 0 HcmV?d00001 diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680147471.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680147471.oneflow-32 new file mode 100644 index 0000000000000000000000000000000000000000..64998e40b1cffeec73952b8f227b8d6d72674b43 GIT binary patch literal 40 rcmb1OfPlsI-b$RMbyt`jId3{j@g@}|X6EU+mZj#ESQ-6VRAmbQznlyK literal 0 HcmV?d00001 diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680148160.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680148160.oneflow-32 new file mode 100644 index 0000000000000000000000000000000000000000..d00c6968ba3861f9106150eba8d469512c3f3d21 GIT binary patch literal 40 rcmb1OfPlsI-b$Qz5=A#Sa^7^5;!P?_%*@ksElbTSu`-gI!L%O$!5j-+ literal 0 HcmV?d00001 diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680148191.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680148191.oneflow-32 new file mode 100644 index 0000000000000000000000000000000000000000..2d3ebe79e641292fff8da172674c944f167e34e5 GIT binary patch literal 40 rcmb1OfPlsI-b$Q$t3PjdNT9%quVr6vZr=<=6&{7PM literal 0 HcmV?d00001 diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680162058.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680162058.oneflow-32 new file mode 100644 index 0000000000000000000000000000000000000000..2c464c8d1a6dc4caafc59022090d1b19e7ce37a3 GIT binary patch literal 40 rcmb1OfPlsI-b$RyH#9r>bKZ26;!P?_%*@ksElbTSu`&u~_KN@j#DNP6 literal 0 HcmV?d00001 diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680163617.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680163617.oneflow-32 new file mode 100644 index 000000000..e69de29bb diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680163809.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680163809.oneflow-32 new file mode 100644 index 000000000..e69de29bb diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680163941.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680163941.oneflow-32 new file mode 100644 index 000000000..e69de29bb diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680168494.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680168494.oneflow-32 new file mode 100644 index 000000000..e69de29bb diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680168780.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680168780.oneflow-32 new file mode 100644 index 
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680168820.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680168820.oneflow-32
new file mode 100644
index 0000000000000000000000000000000000000000..8ed63e9be45058d52f5cd47425c9b11c88dfae62
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680245650.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680245650.oneflow-32
new file mode 100644
index 0000000000000000000000000000000000000000..247d83caee7d86f8e4ec5bdbdd1dafb243af61c3
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680245838.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680245838.oneflow-32
new file mode 100644
index 000000000..e69de29bb
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680245977.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680245977.oneflow-32
new file mode 100644
index 000000000..e69de29bb
diff --git a/projects/MagicPrompt/oneflow_magicprompt/log.txt b/projects/MagicPrompt/oneflow_magicprompt/log.txt
new file mode 100644
index 000000000..6d6ef73f9
--- /dev/null
+++ b/projects/MagicPrompt/oneflow_magicprompt/log.txt
@@ -0,0 +1,6904 @@
+[03/30 11:28:49] libai INFO: Rank of current process: 0.
World size: 1 +[03/30 11:28:49] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/30 11:28:49] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + +graph.enabled=False + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = False + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=1, + amp=dict(enabled=False), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=1, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers=model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/30 11:28:49] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml +[03/30 11:28:49] lb.engine.default INFO: > compiling dataset index builder ... +[03/30 11:28:50] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.054 seconds +[03/30 11:28:50] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.055 seconds +[03/30 11:28:52] lb.engine.default INFO: Prepare training, validating, testing set +[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: building dataset index ... 
+[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008338 seconds +[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/30 11:28:52] 
lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 +[03/30 11:28:52] libai INFO: > Start building model... +[03/30 11:28:52] lb.engine.default INFO: Model: +GPTForPreTraining( + (GPT_model): GPTModel( + (embeddings): GPTEmbedding( + (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) + (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) + (dropout): Dropout(p=0.1, inplace=False) + ) + (transformer): Transformer( + (layers): ModuleList( + (0): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (1): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (2): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (3): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + 
(output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (4): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (5): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (6): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (7): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): 
MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (8): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (9): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (10): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (11): TransformerLayer( + 
(drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (lm_head): LMLogits() + ) + (loss_func): GPTLoss( + (lm_loss): ParallelCrossEntropyLoss() + ) +) +[03/30 11:28:52] libai INFO: >>> done with building model. Building time: 0.145 seconds +[03/30 11:28:52] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR +[03/30 11:28:53] lb.engine.trainer INFO: Starting training from iteration 0 +[03/30 11:29:18] lb.utils.events INFO: iteration: 0/335008 consumed_samples: 4 total_loss: 10.86 data_time: 0.0014 s/iter lr: 5.00e-05 +[03/30 11:29:44] lb.utils.events INFO: eta: 100 days, 0:25:31 iteration: 1/335008 consumed_samples: 8 total_loss: 10.08 data_time: 0.0010 s/iter lr: 5.00e-05 +[03/30 11:30:10] lb.utils.events INFO: eta: 100 days, 2:37:28 iteration: 2/335008 consumed_samples: 12 total_loss: 9.44 time: 25.8189 s/iter data_time: 0.0011 s/iter total_throughput: 0.15 samples/s lr: 5.00e-05 +[03/30 11:30:35] lb.utils.events INFO: eta: 99 days, 6:11:35 iteration: 3/335008 consumed_samples: 16 total_loss: 9.374 time: 25.5994 s/iter data_time: 0.0009 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 11:31:01] lb.utils.events INFO: eta: 99 days, 21:40:02 iteration: 4/335008 consumed_samples: 20 total_loss: 9.307 time: 25.6548 s/iter data_time: 0.0010 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 11:31:27] lb.utils.events INFO: eta: 99 days, 18:03:38 iteration: 5/335008 consumed_samples: 24 total_loss: 9.212 time: 25.6632 s/iter data_time: 0.0009 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 11:31:53] lb.utils.events INFO: eta: 99 days, 18:24:32 iteration: 6/335008 consumed_samples: 28 total_loss: 9.117 time: 25.6768 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 11:32:18] lb.utils.events INFO: eta: 99 days, 20:01:26 iteration: 7/335008 consumed_samples: 32 total_loss: 9.093 time: 25.7183 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 11:32:44] lb.utils.events INFO: eta: 99 days, 18:23:41 iteration: 8/335008 consumed_samples: 36 total_loss: 9.069 time: 25.7046 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 11:33:10] lb.utils.events INFO: eta: 99 days, 16:24:36 iteration: 9/335008 consumed_samples: 40 total_loss: 8.982 time: 25.6782 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 11:33:35] lb.utils.events INFO: eta: 99 days, 14:25:31 iteration: 10/335008 consumed_samples: 44 total_loss: 8.896 time: 25.6656 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 
5.00e-05 +[03/30 11:34:01] lb.utils.events INFO: eta: 99 days, 16:23:44 iteration: 11/335008 consumed_samples: 48 total_loss: 8.853 time: 25.6755 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 11:34:27] lb.utils.events INFO: eta: 99 days, 17:39:35 iteration: 12/335008 consumed_samples: 52 total_loss: 8.811 time: 25.6799 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 11:34:53] lb.utils.events INFO: eta: 99 days, 18:00:21 iteration: 13/335008 consumed_samples: 56 total_loss: 8.789 time: 25.6990 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 11:35:18] lb.utils.events INFO: eta: 99 days, 18:21:06 iteration: 14/335008 consumed_samples: 60 total_loss: 8.768 time: 25.7027 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 11:35:44] lb.utils.events INFO: eta: 99 days, 17:59:29 iteration: 15/335008 consumed_samples: 64 total_loss: 8.736 time: 25.7000 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 11:36:10] lb.utils.events INFO: eta: 99 days, 18:20:15 iteration: 16/335008 consumed_samples: 68 total_loss: 8.704 time: 25.7059 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 11:36:36] lb.utils.events INFO: eta: 99 days, 19:02:25 iteration: 17/335008 consumed_samples: 72 total_loss: 8.693 time: 25.7103 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 11:37:47] libai INFO: Rank of current process: 0. World size: 1 +[03/30 11:37:47] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/30 11:37:47] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + +graph.enabled=True + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = False + +train.input_placement_device = "cpu" + 
+train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=1, + amp=dict(enabled=False), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=1, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers=model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/30 11:37:47] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml +[03/30 11:37:47] lb.engine.default INFO: > compiling dataset index builder ... +[03/30 11:37:47] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.053 seconds +[03/30 11:37:47] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.054 seconds +[03/30 11:37:49] lb.engine.default INFO: Prepare training, validating, testing set +[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... 
+[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007539 seconds +[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/30 11:37:50] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 +[03/30 11:37:50] libai INFO: > Start building model... 
+[03/30 11:37:50] lb.engine.default INFO: Model: +GPTForPreTraining( + (GPT_model): GPTModel( + (embeddings): GPTEmbedding( + (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) + (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) + (dropout): Dropout(p=0.1, inplace=False) + ) + (transformer): Transformer( + (layers): ModuleList( + (0): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (1): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (2): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (3): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): 
Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (4): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (5): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (6): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (7): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, 
inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (8): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (9): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (10): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (11): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, 
num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (lm_head): LMLogits() + ) + (loss_func): GPTLoss( + (lm_loss): ParallelCrossEntropyLoss() + ) +) +[03/30 11:37:50] libai INFO: >>> done with building model. Building time: 0.155 seconds +[03/30 11:37:50] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR +[03/30 11:37:51] lb.engine.trainer INFO: Starting training from iteration 0 +[03/30 11:37:51] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... +[03/30 11:37:56] lb.engine.trainer ERROR: Exception during training: +Traceback (most recent call last): + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train + self.run_step() + File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step + self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step + loss_dict = self.graph(**data) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__ + self._compile(*args, **kwargs) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile + self.finish_compile_and_init_runtime() + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime + self._c_nn_graph.init_runtime() +oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error + +[03/30 11:37:56] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks) +[03/30 11:49:16] libai INFO: Rank of current process: 0. 
World size: 1 +[03/30 11:49:16] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/30 11:49:16] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + +graph.enabled=True +graph.debug=0 + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = False + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=1, + amp=dict(enabled=False), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=1, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers=model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/30 11:49:16] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml +[03/30 11:49:16] lb.engine.default INFO: > compiling dataset index builder ... +[03/30 11:49:16] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.045 seconds +[03/30 11:49:16] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.046 seconds +[03/30 11:49:18] lb.engine.default INFO: Prepare training, validating, testing set +[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: building dataset index ... 
+[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008186 seconds +[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/30 11:49:19] 
lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/30 11:49:19] libai INFO: > Start building model...
+[03/30 11:49:19] lb.engine.default INFO: Model:
+GPTForPreTraining(
+  (GPT_model): GPTModel(
+    (embeddings): GPTEmbedding(
+      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
+      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
+      (dropout): Dropout(p=0.1, inplace=False)
+    )
+    (transformer): Transformer(
+      (layers): ModuleList(
+        (0): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        [... (1)-(11): TransformerLayer blocks identical to (0) ...]
+      )
+      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+    )
+    (lm_head): LMLogits()
+  )
+  (loss_func): GPTLoss(
+    (lm_loss): ParallelCrossEntropyLoss()
+  )
+)
+[03/30 11:49:19] libai INFO: >>> done with building model. Building time: 0.157 seconds
+[03/30 11:49:19] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/30 11:49:19] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/30 11:49:19] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/30 11:49:20] lb.engine.trainer INFO: Starting training from iteration 0
+[03/30 11:49:20] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/30 11:49:25] lb.engine.trainer ERROR: Exception during training:
+Traceback (most recent call last):
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
+    self.run_step()
+  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
+    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
+    loss_dict = self.graph(**data)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
+    self._compile(*args, **kwargs)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
+    self.finish_compile_and_init_runtime()
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime
+    self._c_nn_graph.init_runtime()
+oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
+
+[03/30 11:49:25] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks)
+[03/30 11:49:48] libai INFO: Rank of current process: 0. World size: 1
+[03/30 11:49:48] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/30 11:49:48] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+from libai.config import LazyCall
+from libai.evaluation import PPLEvaluator
+from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
+from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
+from configs.common.optim import optim
+
+from libai.scheduler import WarmupExponentialLR
+
+from configs.common.train import train
+from configs.common.models.graph import graph
+
+graph.enabled=True
+graph.debug=1
+
+vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
+merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
+train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
+
+tokenization.tokenizer.vocab_file = vocab_file
+tokenization.tokenizer.merges_file = merge_files
+dataloader.train.dataset[0].data_prefix = train_data_prefix
+dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
+
+# gpt2 model config
+model.cfg.pretrained_model_path = None
+model.cfg.embedding_dropout_prob = 0.1
+model.cfg.attention_dropout_prob = 0.1
+model.cfg.output_dropout_prob = 0.1
+model.cfg.num_attention_heads = 12
+model.cfg.hidden_size = 768
+model.cfg.ffn_hidden_size = 4 * 768
+model.cfg.hidden_layers = 12
+model.cfg.max_seq_length = 1024
+model.cfg.initializer_range = 0.02
+model.cfg.vocab_size = 50257
+model.cfg.layernorm_epsilon = 1e-5
+model.cfg.use_scaled_init_for_output_weights = True
+model.cfg.bias_gelu_fusion = False
+model.cfg.bias_dropout_fusion = False
+model.cfg.scale_mask_softmax_fusion = False
+model.cfg.apply_query_key_layer_scaling = True
+model.cfg.apply_residual_post_layernorm = False
+model.cfg.amp_enabled = False
+
+train.input_placement_device = "cpu"
+
+train.dist.pipeline_num_layers = model.cfg.hidden_layers
+
+for ds in dataloader.train.dataset:
+    ds.max_seq_length = model.cfg.max_seq_length
+
+optim.lr = 5.0e-05
+optim.params.clip_grad_max_norm = None
+optim.params.clip_grad_norm_type = None
+
+train.update(
+    dict(
+        output_dir="projects/MagicPrompt/oneflow_magicprompt",
+        train_micro_batch_size=4,
+        test_micro_batch_size=4,
+        train_epoch=33,
+        train_iter=10000,
+        log_period=1,
+        amp=dict(enabled=False),
+        warmup_ratio=0,
+        checkpointer=dict(period=8000, max_to_keep=20),
+        dist=dict(
+            data_parallel_size=1,
+            tensor_parallel_size=1,
+            pipeline_parallel_size=1,
+            # pipeline_num_layers=model.cfg.hidden_layers,
+        ),
+        scheduler=LazyCall(WarmupExponentialLR)(
+            warmup_factor=0.0,
+            gamma=1.0,
+            warmup_method="linear",
+            warmup_iter=0.0,
+        ),
+        evaluation=dict(
+            enabled=True,
+            evaluator=LazyCall(PPLEvaluator)(),
+            eval_iter=250,
+            eval_period=4000,
+        ),
+        rdma_enabled=False,
+    )
+)
+
+[03/30 11:49:48] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[03/30 11:49:48] lb.engine.default INFO: > compiling dataset index builder ...
+[03/30 11:49:48] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.044 seconds
+[03/30 11:49:48] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.045 seconds
+[03/30 11:49:50] lb.engine.default INFO: Prepare training, validating, testing set
+[03/30 11:49:50] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
+[03/30 11:49:50] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
+[03/30 11:49:50] lb.data.data_utils.indexed_dataset INFO: reading sizes...
+[03/30 11:49:50] lb.data.data_utils.indexed_dataset INFO: reading pointers...
+[03/30 11:49:50] lb.data.data_utils.indexed_dataset INFO: reading document index...
+[03/30 11:49:50] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
+[03/30 11:49:50] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
+[03/30 11:49:50] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
+[03/30 11:49:50] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.006787 seconds
+[03/30 11:49:50] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
+[03/30 11:49:50] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
+[03/30 11:49:50] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.003 seconds
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/30 11:49:50] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[03/30 11:49:50] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/30 11:49:50] libai INFO: > Start building model...
+[03/30 11:49:50] lb.engine.default INFO: Model:
+[... model architecture printout identical to the first run ...]
+[03/30 11:49:50] libai INFO: >>> done with building model. Building time: 0.154 seconds
+[03/30 11:49:50] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/30 11:49:50] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/30 11:49:50] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/30 11:49:51] lb.engine.trainer INFO: Starting training from iteration 0
+[03/30 11:49:51] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/30 11:49:56] lb.engine.trainer ERROR: Exception during training:
+Traceback (most recent call last):
+[... frames identical to the first failure ...]
+oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
+
+[03/30 11:49:56] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks)
+[03/30 15:40:54] libai INFO: Rank of current process: 0. World size: 1
+[03/30 15:40:54] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/30 15:40:54] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+[... config contents identical to the dump above (graph.enabled=True) ...]
+[03/30 15:40:54] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[03/30 15:40:54] lb.engine.default INFO: > compiling dataset index builder ...
+[03/30 15:40:54] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.050 seconds
+[03/30 15:40:54] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.051 seconds
+[03/30 15:40:56] lb.engine.default INFO: Prepare training, validating, testing set
+[03/30 15:40:56] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
+[... dataset index build and index-map loading logs repeated as in the first run (timings differ slightly) ...]
+[03/30 15:40:56] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/30 15:40:56] libai INFO: > Start building model...
+[03/30 15:40:56] lb.engine.default INFO: Model:
+[... model architecture printout identical to the first run ...]
+[03/30 15:40:56] libai INFO: >>> done with building model. Building time: 0.169 seconds
+[03/30 15:40:56] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/30 15:40:56] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/30 15:40:56] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/30 15:40:58] lb.engine.trainer INFO: Starting training from iteration 0
+[03/30 15:40:58] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/30 15:41:03] lb.engine.trainer ERROR: Exception during training:
+Traceback (most recent call last):
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
+    self.run_step()
+  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
+    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
+    loss_dict = self.graph(**data)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
+    self._compile(*args, **kwargs)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
+    self.finish_compile_and_init_runtime()
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime
+    self._c_nn_graph.init_runtime()
+oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate
+    device->Alloc(options, &ptr, size)
+Error Type: oneflow.ErrorProto.runtime_error
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator()
+
+Error Type: oneflow.ErrorProto.runtime_error
+
+[03/30 15:41:03] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks)
+[03/30 16:06:54] libai INFO: Rank of current process: 0. World size: 1
+[03/30 16:06:54] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/30 16:06:54] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+[... config contents identical to the dump above, except graph.enabled=False ...]
+[03/30 16:06:54] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[03/30 16:06:54] lb.engine.default INFO: > compiling dataset index builder ...
+[03/30 16:06:54] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.052 seconds
+[03/30 16:06:54] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.053 seconds
+[03/30 16:06:56] lb.engine.default INFO: Prepare training, validating, testing set
+[03/30 16:06:56] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
+[... dataset index build and index-map loading logs repeated as in the first run (timings differ slightly) ...]
+[03/30 16:06:56] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/30 16:06:56] libai INFO: > Start building model...
+[03/30 16:06:56] lb.engine.default INFO: Model:
+GPTForPreTraining(
+  (GPT_model): GPTModel(
+    (embeddings): GPTEmbedding(
+      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
+      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
+      (dropout): Dropout(p=0.1, inplace=False)
+    )
+    (transformer): Transformer(
+      (layers): ModuleList(
+        (0): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (1): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (2): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (3): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (4): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (5): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (6): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (7): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (8): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (9): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (10): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (11): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+      )
+      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+    )
+    (lm_head): LMLogits()
+  )
+  (loss_func): GPTLoss(
+    (lm_loss): ParallelCrossEntropyLoss()
+  )
+)
+[03/30 16:06:56] libai INFO: >>> done with building model. Building time: 0.135 seconds
+[03/30 16:06:56] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/30 16:06:57] lb.engine.trainer INFO: Starting training from iteration 0
+[03/30 16:07:23] lb.utils.events INFO: iteration: 0/335008 consumed_samples: 4 total_loss: 10.86 data_time: 0.0028 s/iter lr: 5.00e-05
+[03/30 16:07:48] lb.utils.events INFO: eta: 99 days, 12:07:57 iteration: 1/335008 consumed_samples: 8 total_loss: 10.08 data_time: 0.0016 s/iter lr: 5.00e-05
+[03/30 16:08:14] lb.utils.events INFO: eta: 99 days, 18:10:44 iteration: 2/335008 consumed_samples: 12 total_loss: 9.44 time: 25.7281 s/iter data_time: 0.0013 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
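The ~99-day ETA lines are plain extrapolation from the smoothed iteration time (this 12-layer GPT-2 runs at roughly 25.7 s per iteration on a single device here). A back-of-the-envelope reproduction, not necessarily LiBai's exact bookkeeping:

    secs_per_iter = 25.7281          # smoothed time/iter logged at iteration 2
    remaining_iters = 335008 - 2
    eta_hours = remaining_iters * secs_per_iter / 3600
    print(divmod(eta_hours, 24))     # ~(99.0, 18.2) -> "eta: 99 days, 18:10:44"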
+[03/30 16:10:06] libai INFO: Rank of current process: 0. World size: 1
+[03/30 16:10:06] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/30 16:10:06] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+from libai.config import LazyCall
+from libai.evaluation import PPLEvaluator
+from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
+from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
+from configs.common.optim import optim
+
+from libai.scheduler import WarmupExponentialLR
+
+from configs.common.train import train
+from configs.common.models.graph import graph
+
+graph.enabled=False
+graph.debug=1
+
+vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
+merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
+train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
+
+tokenization.tokenizer.vocab_file = vocab_file
+tokenization.tokenizer.merges_file = merge_files
+dataloader.train.dataset[0].data_prefix = train_data_prefix
+dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
+
+# gpt2 model config
+model.cfg.pretrained_model_path = None
+model.cfg.embedding_dropout_prob = 0.1
+model.cfg.attention_dropout_prob = 0.1
+model.cfg.output_dropout_prob = 0.1
+model.cfg.num_attention_heads = 12
+model.cfg.hidden_size = 768
+model.cfg.ffn_hidden_size = 4 * 768
+model.cfg.hidden_layers = 12
+model.cfg.max_seq_length = 1024
+model.cfg.initializer_range = 0.02
+model.cfg.vocab_size = 50257
+model.cfg.layernorm_epsilon = 1e-5
+model.cfg.use_scaled_init_for_output_weights = True
+model.cfg.bias_gelu_fusion = False
+model.cfg.bias_dropout_fusion = False
+model.cfg.scale_mask_softmax_fusion = False
+model.cfg.apply_query_key_layer_scaling = True
+model.cfg.apply_residual_post_layernorm = False
+model.cfg.amp_enabled = False
+
+train.input_placement_device = "cpu"
+
+train.dist.pipeline_num_layers = model.cfg.hidden_layers
+
+for ds in dataloader.train.dataset:
+    ds.max_seq_length = model.cfg.max_seq_length
+
+optim.lr = 5.0e-05
+optim.params.clip_grad_max_norm = None
+optim.params.clip_grad_norm_type = None
+
+train.update(
+    dict(
+        output_dir="projects/MagicPrompt/oneflow_magicprompt",
+        train_micro_batch_size=4,
+        test_micro_batch_size=4,
+        train_epoch=33,
+        train_iter=10000,
+        log_period=1,
+        amp=dict(enabled=False),
+        warmup_ratio=0,
+        checkpointer=dict(period=8000, max_to_keep=20),
+        dist=dict(
+            data_parallel_size=1,
+            tensor_parallel_size=1,
+            pipeline_parallel_size=1,
+            # pipeline_num_layers=model.cfg.hidden_layers,
+        ),
+        scheduler=LazyCall(WarmupExponentialLR)(
+            warmup_factor=0.0,
+            gamma=1.0,
+            warmup_method="linear",
+            warmup_iter=0.0,
+        ),
+        evaluation=dict(
+            enabled=True,
+            evaluator=LazyCall(PPLEvaluator)(),
+            eval_iter=250,
+            eval_period=4000,
+        ),
+        rdma_enabled=False,
+    )
+)
+
+[03/30 16:10:06] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[03/30 16:10:06] lb.engine.default INFO: > compiling dataset index builder ...
+[03/30 16:10:06] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.052 seconds
+[03/30 16:10:06] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.053 seconds
+[03/30 16:10:08] lb.engine.default INFO: Prepare training, validating, testing set
+[03/30 16:10:08] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
+[03/30 16:10:08] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
+[03/30 16:10:08] lb.data.data_utils.indexed_dataset INFO: reading sizes...
+[03/30 16:10:08] lb.data.data_utils.indexed_dataset INFO: reading pointers...
+[03/30 16:10:08] lb.data.data_utils.indexed_dataset INFO: reading document index...
+[03/30 16:10:08] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
+[03/30 16:10:08] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
+[03/30 16:10:08] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
+[03/30 16:10:08] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007809 seconds
+[03/30 16:10:08] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
+[03/30 16:10:08] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
+[03/30 16:10:08] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.005 seconds
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/30 16:10:08] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[03/30 16:10:08] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
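The auto-scaled iteration count is consistent with train_epoch x usable samples / batch size, where the usable count is one less than the logged total. That off-by-one is an inference from the numbers (Megatron-style sample-idx arrays hold N+1 boundaries for N samples), not a quote of LiBai's code:

    import math

    train_epoch, micro_batch = 33, 4
    usable_samples = 40608 - 1       # logged total minus one boundary entry
    print(math.ceil(train_epoch * usable_samples / micro_batch))  # 335008

The same arithmetic gives the train.train_iter=446655 (33 * 13535 at micro-batch 1) reported by the graph-mode runs below.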
+[03/30 16:10:08] libai INFO: > Start building model...
+[03/30 16:10:08] libai INFO: >>> done with building model. Building time: 0.156 seconds
+[03/30 16:10:08] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/30 16:10:09] lb.engine.trainer INFO: Starting training from iteration 0
+[03/30 16:10:35] lb.utils.events INFO: iteration: 0/335008 consumed_samples: 4 total_loss: 10.86 data_time: 0.0018 s/iter lr: 5.00e-05
+[03/30 16:11:01] lb.utils.events INFO: eta: 99 days, 11:08:04 iteration: 1/335008 consumed_samples: 8 total_loss: 10.08 data_time: 0.0014 s/iter lr: 5.00e-05
+[03/30 16:11:26] lb.utils.events INFO: eta: 99 days, 13:30:23 iteration: 2/335008 consumed_samples: 12 total_loss: 9.44 time: 25.6779 s/iter data_time: 0.0012 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
+[03/30 16:11:52] lb.utils.events INFO: eta: 99 days, 5:53:49 iteration: 3/335008 consumed_samples: 16 total_loss: 9.374 time: 25.5962 s/iter data_time: 0.0010 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
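The total_throughput field is simply consumed samples per second over the smoothed window; a one-line sanity check:

    print(4 / 25.6779)  # micro-batch / smoothed s/iter = 0.1558 -> rendered as "0.16 samples/s"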
+[03/30 16:12:17] libai INFO: Rank of current process: 0. World size: 1
+[03/30 16:12:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/30 16:12:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+from libai.config import LazyCall
+from libai.evaluation import PPLEvaluator
+from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
+from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
+from configs.common.optim import optim
+
+from libai.scheduler import WarmupExponentialLR
+
+from configs.common.train import train
+from configs.common.models.graph import graph
+
+graph.enabled=True
+graph.debug=1
+
+vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
+merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
+train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
+
+tokenization.tokenizer.vocab_file = vocab_file
+tokenization.tokenizer.merges_file = merge_files
+dataloader.train.dataset[0].data_prefix = train_data_prefix
+dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
+
+# gpt2 model config
+model.cfg.pretrained_model_path = None
+model.cfg.embedding_dropout_prob = 0.1
+model.cfg.attention_dropout_prob = 0.1
+model.cfg.output_dropout_prob = 0.1
+model.cfg.num_attention_heads = 12
+model.cfg.hidden_size = 768
+model.cfg.ffn_hidden_size = 4 * 768
+model.cfg.hidden_layers = 12
+model.cfg.max_seq_length = 1024
+model.cfg.initializer_range = 0.02
+model.cfg.vocab_size = 50257
+model.cfg.layernorm_epsilon = 1e-5
+model.cfg.use_scaled_init_for_output_weights = True
+model.cfg.bias_gelu_fusion = False
+model.cfg.bias_dropout_fusion = False
+model.cfg.scale_mask_softmax_fusion = False
+model.cfg.apply_query_key_layer_scaling = True
+model.cfg.apply_residual_post_layernorm = False
+model.cfg.amp_enabled = False
+
+train.input_placement_device = "cpu"
+
+train.dist.pipeline_num_layers = model.cfg.hidden_layers
+
+for ds in dataloader.train.dataset:
+    ds.max_seq_length = model.cfg.max_seq_length
+
+optim.lr = 5.0e-05
+optim.params.clip_grad_max_norm = None
+optim.params.clip_grad_norm_type = None
+
+train.update(
+    dict(
+        output_dir="projects/MagicPrompt/oneflow_magicprompt",
+        train_micro_batch_size=1,
+        test_micro_batch_size=1,
+        train_epoch=33,
+        train_iter=10000,
+        log_period=1,
+        amp=dict(enabled=False),
+        warmup_ratio=0,
+        checkpointer=dict(period=8000, max_to_keep=20),
+        dist=dict(
+            data_parallel_size=1,
+            tensor_parallel_size=1,
+            pipeline_parallel_size=1,
+            # pipeline_num_layers=model.cfg.hidden_layers,
+        ),
+        scheduler=LazyCall(WarmupExponentialLR)(
+            warmup_factor=0.0,
+            gamma=1.0,
+            warmup_method="linear",
+            warmup_iter=0.0,
+        ),
+        evaluation=dict(
+            enabled=True,
+            evaluator=LazyCall(PPLEvaluator)(),
+            eval_iter=250,
+            eval_period=4000,
+        ),
+        rdma_enabled=False,
+    )
+)
+
+[03/30 16:12:17] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[03/30 16:12:17] lb.engine.default INFO: > compiling dataset index builder ...
+[03/30 16:12:18] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.051 seconds
+[03/30 16:12:18] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.052 seconds
+[03/30 16:12:19] lb.engine.default INFO: Prepare training, validating, testing set
+[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
+[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
+[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: reading sizes...
+[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: reading pointers...
+[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: reading document index...
+[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
+[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
+[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
+[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007111 seconds
+[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
+[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
+[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
+[03/30 16:12:19] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ...
+[03/30 16:12:19] lb.data.datasets.gpt_dataset INFO: > last epoch number of samples (977) is smaller than 80% of number of samples per epoch (4511), setting separate_last_epoch to True
+[03/30 16:12:19] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ...
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.010740
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ...
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.001255
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 9023) and [9023, 13535) ...
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000525
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_doc_idx.npy
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_sample_idx.npy
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_shuffle_idx.npy
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: total number of samples: 13536
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: total number of epochs: 3
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ...
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > only one epoch required, setting separate_last_epoch to False
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ...
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.003128
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ...
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.000350
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 4511) and [4511, 4511) ...
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000232
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_doc_idx.npy
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_sample_idx.npy
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_shuffle_idx.npy
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ...
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > only one epoch required, setting separate_last_epoch to False
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ...
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.003130
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ...
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.000328
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 4511) and [4511, 4511) ...
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000229
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_doc_idx.npy
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_sample_idx.npy
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_shuffle_idx.npy
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
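The separate_last_epoch decision logged during index building is the standard Megatron-LM heuristic, which LiBai's gpt_dataset follows: if the final partial epoch would contribute fewer than 80% of a full epoch's samples, those samples are shuffled separately so they are not over-represented early in training. Reconstructed from the logged numbers:

    last_epoch_samples = 977    # from the log
    samples_per_epoch = 4511    # usable samples per epoch (one fewer than the logged 4512)
    print(last_epoch_samples < 0.8 * samples_per_epoch)
    # True -> the last epoch gets its own shuffle range, hence the split [0, 9023) / [9023, 13535)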
+[03/30 16:12:20] lb.engine.default INFO: Auto-scaling the config to train.train_iter=446655, train.warmup_iter=0
+[03/30 16:12:20] libai INFO: > Start building model...
+[03/30 16:12:20] libai INFO: >>> done with building model. Building time: 0.153 seconds
+[03/30 16:12:20] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/30 16:12:20] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/30 16:12:20] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/30 16:12:21] lb.engine.trainer INFO: Starting training from iteration 0
+[03/30 16:12:21] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/30 17:28:10] libai INFO: Rank of current process: 0. World size: 1
+[03/30 17:28:10] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/30 17:28:10] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
"cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=1, + test_micro_batch_size=1, + train_epoch=33, + train_iter=10000, + log_period=1, + amp=dict(enabled=False), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=1, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers=model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/30 17:28:10] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml +[03/30 17:28:10] lb.engine.default INFO: > compiling dataset index builder ... +[03/30 17:28:10] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.094 seconds +[03/30 17:28:10] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.095 seconds +[03/30 17:28:12] lb.engine.default INFO: Prepare training, validating, testing set +[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... 
+[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007207 seconds
+[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
+[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
+[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_doc_idx.npy
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_sample_idx.npy
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_shuffle_idx.npy
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: total number of samples: 13536
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: total number of epochs: 3
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_doc_idx.npy
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_sample_idx.npy
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_shuffle_idx.npy
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_doc_idx.npy
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_sample_idx.npy
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_shuffle_idx.npy
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[03/30 17:28:12] lb.engine.default INFO: Auto-scaling the config to train.train_iter=446655, train.warmup_iter=0
+[03/30 17:28:12] libai INFO: > Start building model...
+[03/30 17:28:12] lb.engine.default INFO: Model: +GPTForPreTraining( + (GPT_model): GPTModel( + (embeddings): GPTEmbedding( + (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) + (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) + (dropout): Dropout(p=0.1, inplace=False) + ) + (transformer): Transformer( + (layers): ModuleList( + (0): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (1): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (2): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (3): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): 
Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (4): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (5): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (6): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (7): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, 
inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (8): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (9): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (10): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (11): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, 
num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (lm_head): LMLogits() + ) + (loss_func): GPTLoss( + (lm_loss): ParallelCrossEntropyLoss() + ) +) +[03/30 17:28:12] libai INFO: >>> done with building model. Building time: 0.153 seconds +[03/30 17:28:12] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR +[03/30 17:28:12] lb.engine.default INFO: Graph debug mode on, automatically output debug info. +[03/30 17:28:12] lb.engine.default INFO: Graph debug mode on, automatically output debug info. +[03/30 17:28:14] lb.engine.trainer INFO: Starting training from iteration 0 +[03/30 17:28:14] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... +[03/30 17:28:21] lb.utils.events INFO: iteration: 0/446655 consumed_samples: 1 total_loss: 11.33 data_time: 0.0009 s/iter lr: 5.00e-05 +[03/30 17:28:27] lb.utils.events INFO: eta: 33 days, 6:54:54 iteration: 1/446655 consumed_samples: 2 total_loss: 11.34 data_time: 0.0016 s/iter lr: 5.00e-05 +[03/30 17:28:34] lb.utils.events INFO: eta: 33 days, 3:04:22 iteration: 2/446655 consumed_samples: 3 total_loss: 11.33 time: 6.4083 s/iter data_time: 0.0029 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:28:40] lb.utils.events INFO: eta: 33 days, 3:14:30 iteration: 3/446655 consumed_samples: 4 total_loss: 11.34 time: 6.4096 s/iter data_time: 0.0024 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:28:46] lb.utils.events INFO: eta: 33 days, 3:19:03 iteration: 4/446655 consumed_samples: 5 total_loss: 11.36 time: 6.4098 s/iter data_time: 0.0021 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:28:53] lb.utils.events INFO: eta: 33 days, 3:20:56 iteration: 5/446655 consumed_samples: 6 total_loss: 11.35 time: 6.4101 s/iter data_time: 0.0020 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:28:59] lb.utils.events INFO: eta: 33 days, 3:18:50 iteration: 6/446655 consumed_samples: 7 total_loss: 11.34 time: 6.4101 s/iter data_time: 0.0020 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:29:06] lb.utils.events INFO: eta: 33 days, 3:17:35 iteration: 7/446655 consumed_samples: 8 total_loss: 11.34 time: 6.4013 s/iter data_time: 0.0018 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:32:57] libai INFO: Rank of current process: 0. 
World size: 1 +[03/30 17:32:57] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/30 17:32:57] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + +graph.enabled=True + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = False + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=2, + test_micro_batch_size=1, + train_epoch=33, + train_iter=10000, + log_period=1, + amp=dict(enabled=False), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=1, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers=model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/30 17:32:57] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml +[03/30 17:32:57] lb.engine.default INFO: > compiling dataset index builder ... +[03/30 17:32:57] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.056 seconds +[03/30 17:32:57] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.057 seconds +[03/30 17:32:59] lb.engine.default INFO: Prepare training, validating, testing set +[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: building dataset index ... 
+[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007043 seconds +[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ... +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > last epoch number of samples (1953) is smaller than 80% of number of samples per epoch (4511), setting separate_last_epoch to True +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ... +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.019319 +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ... +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.001643 +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 18047) and [18047, 22559) ... 
+[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000756 +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_20000ns_1024sl_1234s_doc_idx.npy +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_20000ns_1024sl_1234s_sample_idx.npy +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_20000ns_1024sl_1234s_shuffle_idx.npy +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.005 seconds +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: total number of samples: 22560 +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: total number of epochs: 5 +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_doc_idx.npy +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_sample_idx.npy +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_shuffle_idx.npy +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_doc_idx.npy +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_sample_idx.npy +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_shuffle_idx.npy +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/30 17:32:59] lb.engine.default INFO: Auto-scaling the config to train.train_iter=372224, train.warmup_iter=0 +[03/30 17:32:59] libai INFO: > Start building model... 
+[03/30 17:32:59] lb.engine.default INFO: Model:
+GPTForPreTraining(
+  [... identical module tree to the first "Model:" dump above (12 TransformerLayers, hidden_size=768, num_heads=12); omitted here because it is byte-for-byte the same ...]
+)
+[03/30 17:32:59] libai INFO: >>> done with building model. Building time: 0.156 seconds
+[03/30 17:32:59] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/30 17:33:00] lb.engine.trainer INFO: Starting training from iteration 0
+[03/30 17:33:00] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/30 17:33:06] lb.engine.trainer ERROR: Exception during training:
+Traceback (most recent call last):
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
+    self.run_step()
+  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
+    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
+    loss_dict = self.graph(**data)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
+    self._compile(*args, **kwargs)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
+    self.finish_compile_and_init_runtime()
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime
+    self._c_nn_graph.init_runtime()
+oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate
+    device->Alloc(options, &ptr, size)
+Error Type: oneflow.ErrorProto.runtime_error
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator()
+
+Error Type: oneflow.ErrorProto.runtime_error
+
+[03/30 17:33:06] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks)
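The failure above happens while nn.Graph initializes its runtime: MluDevice::Alloc is the
Cambricon device-memory allocator, so this is most likely the MLU running out of memory at
train_micro_batch_size=2 (note that the index maps were also rebuilt as *_20000ns_* files
for this run, since the number of samples to cover scales with the global batch:
10000 train_iter x 2 samples per step). The next run below recovers by dropping the micro
batch back to 1. A minimal sketch of the delta, assuming everything else in
projects/MagicPrompt/configs/gpt2_training.py stays exactly as listed above:

    # Sketch of the only setting that changes between the failing and the recovering run;
    # halving the per-device micro batch roughly halves activation memory on the MLU.
    train.update(
        dict(
            train_micro_batch_size=1,  # was 2 in the run that hit MluDevice::Alloc
        )
    )

+[03/30 17:33:37] libai INFO: Rank of current process: 0.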
World size: 1
+[03/30 17:33:37] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/30 17:33:37] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+[... config contents identical to the previous run's listing above, except train_micro_batch_size=1 instead of 2 ...]
+
+[03/30 17:33:37] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[03/30 17:33:37] lb.engine.default INFO: > compiling dataset index builder ...
+[03/30 17:33:37] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.080 seconds
+[03/30 17:33:37] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.080 seconds
+[03/30 17:33:39] lb.engine.default INFO: Prepare training, validating, testing set
+[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
+[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.006754 seconds +[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_doc_idx.npy +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_sample_idx.npy +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_shuffle_idx.npy +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: total number of samples: 13536 +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: total number of epochs: 3 +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_doc_idx.npy +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_sample_idx.npy +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_shuffle_idx.npy +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_doc_idx.npy +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_sample_idx.npy +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_shuffle_idx.npy +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/30 17:33:39] 
lb.engine.default INFO: Auto-scaling the config to train.train_iter=446655, train.warmup_iter=0
+[03/30 17:33:39] libai INFO: > Start building model...
+[03/30 17:33:39] lb.engine.default INFO: Model:
+GPTForPreTraining(
+  [... identical module tree to the first "Model:" dump above (12 TransformerLayers, hidden_size=768, num_heads=12); omitted here because it is byte-for-byte the same ...]
+)
+[03/30 17:33:39] libai INFO: >>> done with building model. Building time: 0.161 seconds
+[03/30 17:33:39] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/30 17:33:40] lb.engine.trainer INFO: Starting training from iteration 0
+[03/30 17:33:40] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/30 17:33:47] lb.utils.events INFO: iteration: 0/446655 consumed_samples: 1 total_loss: 11.33 data_time: 0.0012 s/iter lr: 5.00e-05
+[03/30 17:33:53] lb.utils.events INFO: eta: 33 days, 2:28:50 iteration: 1/446655 consumed_samples: 2 total_loss: 11.34 data_time: 0.0011 s/iter lr: 5.00e-05
+[03/30 17:33:59] lb.utils.events INFO: eta: 33 days, 1:36:38 iteration: 2/446655 consumed_samples: 3 total_loss: 11.33 time: 6.3965 s/iter data_time: 0.0011 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
+[03/30 17:34:06] lb.utils.events INFO: eta: 33 days, 1:55:18 iteration: 3/446655 consumed_samples: 4 total_loss: 11.34 time: 6.3990 s/iter data_time: 0.0012 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
+[03/30 17:34:12] lb.utils.events INFO: eta: 33 days, 2:13:58 iteration: 4/446655 consumed_samples: 5 total_loss: 11.36 time: 6.4007 s/iter data_time: 0.0013 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
+[03/30 17:34:19] lb.utils.events INFO: eta: 33 days, 1:55:05 iteration: 5/446655 consumed_samples: 6 total_loss: 11.35 time: 6.3991 s/iter data_time: 0.0013 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
+[03/30 17:34:25] lb.utils.events INFO: eta: 33 days, 2:13:45 iteration: 6/446655 consumed_samples: 7 total_loss: 11.34 time: 6.4000 s/iter data_time: 0.0012 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
+[03/30 17:34:31] lb.utils.events INFO: eta: 33 days, 2:07:00 iteration: 7/446655 consumed_samples: 8 total_loss: 11.34 time: 6.4000 s/iter data_time: 0.0012 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
+[03/30 17:34:38] lb.utils.events INFO: eta: 33 days, 2:00:16 iteration: 8/446655 consumed_samples: 9 total_loss: 11.34 time: 6.3995 s/iter data_time: 0.0014 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
+[03/30 17:34:44] lb.utils.events INFO: eta: 33 days, 1:55:06 iteration: 9/446655 consumed_samples: 10 total_loss: 11.34 time: 6.3994 s/iter data_time: 0.0015 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
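A sanity check on the reported eta: at the steady ~6.40 s/iter shown above, the auto-scaled
446,655 iterations take about 446,655 x 6.4 s = 2.86e6 s, i.e. just over 33 days, which
matches the logger's "eta: 33 days". The iteration count itself comes from the
epoch-to-iteration auto-scaling, roughly train_epoch x dataset_samples / micro_batch =
33 x ~13,536 / 1.

+[03/30 17:34:51] lb.utils.events INFO: eta: 33 days, 1:49:55 iteration: 10/446655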
consumed_samples: 11 total_loss: 11.34 time: 6.3990 s/iter data_time: 0.0015 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:34:57] lb.utils.events INFO: eta: 33 days, 1:54:06 iteration: 11/446655 consumed_samples: 12 total_loss: 11.34 time: 6.3990 s/iter data_time: 0.0015 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:35:03] lb.utils.events INFO: eta: 33 days, 1:49:42 iteration: 12/446655 consumed_samples: 13 total_loss: 11.33 time: 6.3988 s/iter data_time: 0.0017 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:35:10] lb.utils.events INFO: eta: 33 days, 1:53:53 iteration: 13/446655 consumed_samples: 14 total_loss: 11.33 time: 6.3991 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:35:16] lb.utils.events INFO: eta: 33 days, 1:55:44 iteration: 14/446655 consumed_samples: 15 total_loss: 11.33 time: 6.3991 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:35:23] lb.utils.events INFO: eta: 33 days, 1:52:31 iteration: 15/446655 consumed_samples: 16 total_loss: 11.33 time: 6.3990 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:35:29] lb.utils.events INFO: eta: 33 days, 1:55:32 iteration: 16/446655 consumed_samples: 17 total_loss: 11.33 time: 6.3992 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:35:35] lb.utils.events INFO: eta: 33 days, 1:52:18 iteration: 17/446655 consumed_samples: 18 total_loss: 11.33 time: 6.3990 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:35:42] lb.utils.events INFO: eta: 33 days, 1:55:19 iteration: 18/446655 consumed_samples: 19 total_loss: 11.33 time: 6.3992 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:35:48] lb.utils.events INFO: eta: 33 days, 1:56:22 iteration: 19/446655 consumed_samples: 20 total_loss: 11.33 time: 6.3993 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:35:55] lb.utils.events INFO: eta: 33 days, 1:55:06 iteration: 20/446655 consumed_samples: 21 total_loss: 11.33 time: 6.3993 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:36:01] lb.utils.events INFO: eta: 33 days, 1:55:02 iteration: 21/446655 consumed_samples: 22 total_loss: 11.33 time: 6.3993 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:36:07] lb.utils.events INFO: eta: 33 days, 1:54:59 iteration: 22/446655 consumed_samples: 23 total_loss: 11.33 time: 6.3993 s/iter data_time: 0.0017 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:36:14] lb.utils.events INFO: eta: 33 days, 1:54:56 iteration: 23/446655 consumed_samples: 24 total_loss: 11.33 time: 6.3993 s/iter data_time: 0.0017 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:36:20] lb.utils.events INFO: eta: 33 days, 1:54:46 iteration: 24/446655 consumed_samples: 25 total_loss: 11.33 time: 6.3992 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:36:27] lb.utils.events INFO: eta: 33 days, 1:54:37 iteration: 25/446655 consumed_samples: 26 total_loss: 11.33 time: 6.3990 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:36:33] lb.utils.events INFO: eta: 33 days, 1:54:28 iteration: 26/446655 consumed_samples: 27 total_loss: 11.33 time: 6.3989 s/iter data_time: 0.0017 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 
17:36:39] lb.utils.events INFO: eta: 33 days, 1:53:04 iteration: 27/446655 consumed_samples: 28 total_loss: 11.33 time: 6.3989 s/iter data_time: 0.0017 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:36:46] lb.utils.events INFO: eta: 33 days, 1:51:41 iteration: 28/446655 consumed_samples: 29 total_loss: 11.33 time: 6.3988 s/iter data_time: 0.0017 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 17:36:52] lb.utils.events INFO: eta: 33 days, 1:50:08 iteration: 29/446655 consumed_samples: 30 total_loss: 11.33 time: 6.3987 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 +[03/30 18:18:17] libai INFO: Rank of current process: 0. World size: 4 +[03/30 18:18:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/30 18:18:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + +graph.enabled=True +graph.debug = 2 + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = False + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=1, + amp=dict(enabled=False), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=4, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers = 12, + # custom_pipeline_stage_id = [0] * 6 + [1] * 6, + # pipeline_num_layers=model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + 
warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/30 18:18:17] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml +[03/30 18:18:17] lb.engine.default INFO: > compiling dataset index builder ... +[03/30 18:18:17] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.088 seconds +[03/30 18:18:17] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.089 seconds +[03/30 18:18:19] lb.engine.default INFO: Prepare training, validating, testing set +[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007127 seconds +[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ... +[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > last epoch number of samples (2084) is smaller than 80% of number of samples per epoch (4511), setting separate_last_epoch to True +[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ... +[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.142318 +[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ... +[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.008270 +[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 157916) and [157916, 162428) ... 
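In the 17:33 run's iteration lines above, the derived fields are simple arithmetic on the per-iteration time: total_throughput is the reciprocal of time (one sample is consumed per step in this log), and eta is the remaining iterations multiplied by time. A quick check using only numbers taken from the log:

    # eta/throughput sanity check against the trainer log above
    iters_total = 446655                       # "iteration: .../446655"
    sec_per_iter = 6.399                       # "time: 6.3990 s/iter"
    print(1 / sec_per_iter)                    # ~0.156 -> logged as "total_throughput: 0.16 samples/s"
    print(iters_total * sec_per_iter / 86400)  # ~33.08 -> logged as "eta: 33 days, ..."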
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.004275
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.005 seconds
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 162429
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 36
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ...
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > last epoch number of samples (2977) is smaller than 80% of number of samples per epoch (4511), setting separate_last_epoch to True
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ...
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.010095
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ...
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.000795
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 9023) and [9023, 13535) ...
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000490
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 13536
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 3
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ...
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > only one epoch required, setting separate_last_epoch to False
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ...
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.003151
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ...
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.000337
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 4511) and [4511, 4511) ...
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000237
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[03/30 18:18:19] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/30 18:18:20] libai INFO: > Start building model...
+[03/30 18:18:25] lb.engine.default INFO: Model:
+GPTForPreTraining(
+  (GPT_model): GPTModel(
+    (embeddings): GPTEmbedding(
+      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
+      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
+      (dropout): Dropout(p=0.1, inplace=False)
+    )
+    (transformer): Transformer(
+      (layers): ModuleList(
+        (0): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        [(1)-(11): eleven more TransformerLayer blocks, identical to (0); repeated printout omitted]
+      )
+      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+    )
+    (lm_head): LMLogits()
+  )
+  (loss_func): GPTLoss(
+    (lm_loss): ParallelCrossEntropyLoss()
+  )
+)
+[03/30 18:18:25] libai INFO: >>> done with building model. Building time: 5.322 seconds
+[03/30 18:18:25] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/30 18:18:25] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/30 18:18:25] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/30 18:18:26] lb.engine.trainer INFO: Starting training from iteration 0
+[03/30 18:18:26] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/30 18:18:35] lb.engine.trainer ERROR: Exception during training:
+Traceback (most recent call last):
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
+    self.run_step()
+  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
+    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
+    loss_dict = self.graph(**data)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
+    self._compile(*args, **kwargs)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
+    self.finish_compile_and_init_runtime()
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime
+    self._c_nn_graph.init_runtime()
+oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate
+    device->Alloc(options, &ptr, size)
+Error Type: oneflow.ErrorProto.runtime_error
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator()
+
+Error Type: oneflow.ErrorProto.runtime_error
+
+[03/30 18:18:35] lb.engine.hooks INFO: Total training time: 0:00:08 (0:00:00 on hooks)
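The MluDevice::Alloc failure above is an on-device memory allocation error raised while the compiled train graph initializes its runtime, i.e. the MLU ran out of memory for the graph's buffers rather than hitting a compile bug. As a hedged mitigation sketch only: train_micro_batch_size and max_seq_length come from the config dumped above, activation_checkpoint is assumed from LibAI's common train config, and whether these particular values free enough MLU memory is untested.

    # Illustrative edits to projects/MagicPrompt/configs/gpt2_training.py (values are guesses):
    train.train_micro_batch_size = 1                  # was 4 in the dump above
    model.cfg.max_seq_length = 512                    # was 1024; shrinks activation buffers
    # assumption: activation_checkpoint exists in configs/common/train.py's train dict
    train.activation_checkpoint = dict(enabled=True)  # recompute activations in backward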
+[03/31 14:31:38] libai INFO: Rank of current process: 0. World size: 1
+[03/31 14:31:38] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/31 14:31:38] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+[config file contents identical to the dump for the 18:18 run above; repeated echo omitted]
+[03/31 14:31:38] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[03/31 14:31:38] lb.engine.default INFO: > compiling dataset index builder ...
+[03/31 14:31:38] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.050 seconds
+[03/31 14:31:38] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.051 seconds
+[03/31 14:31:40] lb.engine.default INFO: Prepare training, validating, testing set
+[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
+[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
+[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: reading sizes...
+[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: reading pointers...
+[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: reading document index...
+[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
+[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
+[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
+[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007057 seconds
+[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
+[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
+[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.005 seconds
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[03/31 14:31:40] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/31 14:31:40] libai INFO: > Start building model...
+[03/31 14:31:41] lb.engine.default INFO: Model:
+[model printout identical to the one for the 18:18 run above: GPTForPreTraining with GPTEmbedding and 12 identical TransformerLayer blocks; repeated printout omitted]
+[03/31 14:31:41] libai INFO: >>> done with building model. Building time: 0.162 seconds
+[03/31 14:31:41] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/31 14:31:41] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/31 14:31:41] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/31 14:31:42] lb.engine.trainer INFO: Starting training from iteration 0
+[03/31 14:31:42] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/31 14:31:48] lb.engine.trainer ERROR: Exception during training:
+[traceback identical to the 18:18 failure above: MluDevice::Alloc error raised from memory_allocator.cpp line 50 during init_runtime; repeated traceback omitted]
+[03/31 14:31:48] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks)
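The same allocation failure reproduces with World size: 1 (this run) and World size: 4 (the 18:18 run), which points at per-device memory rather than the parallel layout. A minimal probe to separate "the device cannot allocate at all" from "the train graph is too large" — a sketch that assumes only what these logs already use, namely an oneflow-cambricon build that accepts the "mlu" device:

    import oneflow as flow

    # Small eager-mode allocation on the MLU; if even this raises MluDevice::Alloc,
    # the device/driver state is the problem, not the size of the GPT-2 graph.
    x = flow.ones(1024, 1024, device="mlu")
    print(x.sum().to("cpu"))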
+[03/31 14:32:17] libai INFO: Rank of current process: 0. World size: 4
+[03/31 14:32:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/31 14:32:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+[config file contents identical to the dump for the 18:18 run above; repeated echo omitted]
+[03/31 14:32:17] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[03/31 14:32:17] lb.engine.default INFO: > compiling dataset index builder ...
+[03/31 14:32:17] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.043 seconds
+[03/31 14:32:17] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.044 seconds
+[03/31 14:32:19] lb.engine.default INFO: Prepare training, validating, testing set
+[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
+[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
+[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: reading sizes...
+[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: reading pointers...
+[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: reading document index...
+[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
+[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
+[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
+[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.006932 seconds
+[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
+[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
+[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.010 seconds
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 162429
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 36
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 13536
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 3
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[03/31 14:32:20] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/31 14:32:20] libai INFO: > Start building model...
+[03/31 14:40:48] libai INFO: Rank of current process: 0. World size: 4
+[03/31 14:40:48] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/31 14:40:48] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+[config file contents identical to the dump for the 18:18 run above; repeated echo omitted]
+[03/31 14:40:48] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
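The file echoed at the start of each run is executable Python in LibAI's LazyConfig style: expressions like LazyCall(WarmupExponentialLR)(warmup_factor=0.0, ...) record a deferred constructor call that the trainer instantiates when it builds the scheduler. A toy sketch of the mechanism, with simplified stand-ins; LibAI's real implementation lives in libai.config and is more general:

    import functools

    def LazyCall(fn):
        # capture the target callable now, bind its kwargs later
        def binder(**kwargs):
            return functools.partial(fn, **kwargs)
        return binder

    class WarmupExponentialLR:  # stand-in class for the sketch, not the real scheduler
        def __init__(self, warmup_factor, gamma, warmup_method, warmup_iter):
            self.gamma = gamma

    cfg = LazyCall(WarmupExponentialLR)(warmup_factor=0.0, gamma=1.0,
                                        warmup_method="linear", warmup_iter=0.0)
    scheduler = cfg()  # the trainer performs this call at build time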
+[03/31 14:40:48] lb.engine.default INFO: > compiling dataset index builder ... +[03/31 14:40:48] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.048 seconds +[03/31 14:40:48] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.049 seconds +[03/31 14:40:50] lb.engine.default INFO: Prepare training, validating, testing set +[03/31 14:40:50] lb.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/31 14:40:50] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/31 14:40:50] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/31 14:40:50] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/31 14:40:50] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/31 14:40:50] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/31 14:40:50] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/31 14:40:50] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/31 14:40:50] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007409 seconds +[03/31 14:40:50] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/31 14:40:50] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/31 14:40:50] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/31 14:40:50] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy +[03/31 14:40:50] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy +[03/31 14:40:50] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy +[03/31 14:40:50] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.013 seconds +[03/31 14:40:50] lb.data.datasets.gpt_dataset INFO: total number of samples: 162429 +[03/31 14:40:50] lb.data.datasets.gpt_dataset INFO: total number of epochs: 36 +[03/31 14:40:50] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy +[03/31 14:40:50] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy +[03/31 14:40:50] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy +[03/31 14:40:50] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/31 14:40:50] lb.data.datasets.gpt_dataset INFO: total number of samples: 13536 +[03/31 14:40:50] lb.data.datasets.gpt_dataset INFO: total number of epochs: 3 +[03/31 14:40:50] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy +[03/31 14:40:50] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy +[03/31 
+[03/31 14:40:50] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/31 14:40:50] libai INFO: > Start building model...
+[03/31 14:40:51] lb.engine.default INFO: Model:
+GPTForPreTraining(
+  (GPT_model): GPTModel(
+    (embeddings): GPTEmbedding(
+      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
+      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
+      (dropout): Dropout(p=0.1, inplace=False)
+    )
+    (transformer): Transformer(
+      (layers): ModuleList(
+        (0): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (1): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (2): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (3): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (4): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (5): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (6): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (7): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (8): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (9): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (10): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (11): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+      )
+      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+    )
+    (lm_head): LMLogits()
+  )
+  (loss_func): GPTLoss(
+    (lm_loss): ParallelCrossEntropyLoss()
+  )
+)
+[03/31 14:40:51] libai INFO: >>> done with building model. Building time: 0.843 seconds
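The dumped shapes pin the model down as GPT-2 small. A back-of-the-envelope parameter count from the repr above (hypothetical arithmetic, not LiBai code; it assumes LMLogits ties its projection to the VocabEmbedding weight, consistent with LMLogits() printing no parameters of its own):

    vocab, hidden, ffn, layers, seq = 50304, 768, 3072, 12, 1024

    embed = vocab * hidden + seq * hidden                              # token + position tables
    attn = hidden * 3 * hidden + 3 * hidden + hidden * hidden + hidden # query_key_value + dense
    mlp = hidden * ffn + ffn + ffn * hidden + hidden                   # dense_h_to_4h + dense_4h_to_h
    norms = 2 * 2 * hidden                                             # two LayerNorms per layer
    total = embed + layers * (attn + mlp + norms) + 2 * hidden         # + final layernorm_f
    print(f"{total:,}")                                                # 124,475,904 -> ~124M, GPT-2 small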
+[03/31 14:40:51] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/31 14:40:51] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/31 14:40:51] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/31 14:40:52] lb.engine.trainer INFO: Starting training from iteration 0
+[03/31 14:40:52] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/31 14:41:01] lb.engine.trainer ERROR: Exception during training:
+Traceback (most recent call last):
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
+    self.run_step()
+  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
+    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
+    loss_dict = self.graph(**data)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
+    self._compile(*args, **kwargs)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
+    self.finish_compile_and_init_runtime()
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime
+    self._c_nn_graph.init_runtime()
+oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate
+    device->Alloc(options, &ptr, size)
+Error Type: oneflow.ErrorProto.runtime_error
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator()
+
+Error Type: oneflow.ErrorProto.runtime_error
+
+[03/31 14:41:01] lb.engine.hooks INFO: Total training time: 0:00:08 (0:00:00 on hooks)
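The graph-mode run dies inside finish_compile_and_init_runtime: the graph runtime asks MluDevice::Alloc for its memory plan and the allocation fails. A minimal smoke test to separate a driver/memory problem from a LiBai-level one (a sketch, assuming the oneflow-cambricon build registers "mlu" as an eager device string, mirroring the flow.placement("mlu", ...) calls elsewhere in this patch):

    import oneflow as flow

    # A plain eager allocation goes through the same MluDevice::Alloc
    # path as the graph runtime, without nn.Graph in the picture.
    x = flow.ones(1024, 1024, device="mlu")
    print(x.sum().to("cpu"))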
+[03/31 14:54:05] libai INFO: Rank of current process: 0. World size: 4
+[03/31 14:54:05] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/31 14:54:06] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+from libai.config import LazyCall
+from libai.evaluation import PPLEvaluator
+from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
+from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
+from configs.common.optim import optim
+
+from libai.scheduler import WarmupExponentialLR
+
+from configs.common.train import train
+from configs.common.models.graph import graph
+
+graph.enabled=True
+graph.debug = 2
+
+vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
+merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
+train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
+
+tokenization.tokenizer.vocab_file = vocab_file
+tokenization.tokenizer.merges_file = merge_files
+dataloader.train.dataset[0].data_prefix = train_data_prefix
+dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
+
+# gpt2 model config
+model.cfg.pretrained_model_path = None
+model.cfg.embedding_dropout_prob = 0.1
+model.cfg.attention_dropout_prob = 0.1
+model.cfg.output_dropout_prob = 0.1
+model.cfg.num_attention_heads = 12
+model.cfg.hidden_size = 768
+model.cfg.ffn_hidden_size = 4 * 768
+model.cfg.hidden_layers = 12
+model.cfg.max_seq_length = 1024
+model.cfg.initializer_range = 0.02
+model.cfg.vocab_size = 50257
+model.cfg.layernorm_epsilon = 1e-5
+model.cfg.use_scaled_init_for_output_weights = True
+model.cfg.bias_gelu_fusion = False
+model.cfg.bias_dropout_fusion = False
+model.cfg.scale_mask_softmax_fusion = False
+model.cfg.apply_query_key_layer_scaling = True
+model.cfg.apply_residual_post_layernorm = False
+model.cfg.amp_enabled = False
+
+train.input_placement_device = "cpu"
+
+train.dist.pipeline_num_layers = model.cfg.hidden_layers
+
+for ds in dataloader.train.dataset:
+    ds.max_seq_length = model.cfg.max_seq_length
+
+optim.lr = 5.0e-05
+optim.params.clip_grad_max_norm = None
+optim.params.clip_grad_norm_type = None
+
+train.update(
+    dict(
+        output_dir="projects/MagicPrompt/oneflow_magicprompt",
+        train_micro_batch_size=4,
+        test_micro_batch_size=4,
+        train_epoch=33,
+        train_iter=10000,
+        log_period=1,
+        amp=dict(enabled=False),
+        warmup_ratio=0,
+        checkpointer=dict(period=8000, max_to_keep=20),
+        dist=dict(
+            data_parallel_size=4,
+            tensor_parallel_size=1,
+            pipeline_parallel_size=1,
+            # pipeline_num_layers = 12,
+            # custom_pipeline_stage_id = [0] * 6 + [1] * 6,
+            # pipeline_num_layers=model.cfg.hidden_layers,
+        ),
+        scheduler=LazyCall(WarmupExponentialLR)(
+            warmup_factor=0.0,
+            gamma=1.0,
+            warmup_method="linear",
+            warmup_iter=0.0,
+        ),
+        evaluation=dict(
+            enabled=True,
+            evaluator=LazyCall(PPLEvaluator)(),
+            eval_iter=250,
+            eval_period=4000,
+        ),
+        rdma_enabled=False,
+    )
+)
+
+[03/31 14:54:06] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[03/31 14:54:06] lb.engine.default INFO: > compiling dataset index builder ...
+[03/31 14:54:06] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.056 seconds
+[03/31 14:54:06] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.058 seconds
+[03/31 14:54:08] lb.engine.default INFO: Prepare training, validating, testing set
+[03/31 14:54:08] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
+[03/31 14:54:08] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
+[03/31 14:54:08] lb.data.data_utils.indexed_dataset INFO: reading sizes...
+[03/31 14:54:08] lb.data.data_utils.indexed_dataset INFO: reading pointers...
+[03/31 14:54:08] lb.data.data_utils.indexed_dataset INFO: reading document index...
+[03/31 14:54:08] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
+[03/31 14:54:08] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
+[03/31 14:54:08] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
+[03/31 14:54:08] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.009007 seconds
+[03/31 14:54:08] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
+[03/31 14:54:08] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
+[03/31 14:54:08] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.013 seconds
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: total number of samples: 162429
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: total number of epochs: 36
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: total number of samples: 13536
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: total number of epochs: 3
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/31 14:54:08] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[03/31 14:54:08] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/31 14:54:08] libai INFO: > Start building model...
+[03/31 14:54:09] lb.engine.default INFO: Model:
+GPTForPreTraining( ... identical to the model dump of the 14:40 run above ... )
+[03/31 14:54:09] libai INFO: >>> done with building model. Building time: 0.872 seconds
+[03/31 14:54:09] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/31 14:54:09] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/31 14:54:09] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/31 14:54:10] lb.engine.trainer INFO: Starting training from iteration 0
+[03/31 14:54:10] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/31 14:54:19] lb.engine.trainer ERROR: Exception during training:
+Traceback (most recent call last):
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
+    self.run_step()
+  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
+    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
+    loss_dict = self.graph(**data)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
+    self._compile(*args, **kwargs)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
+    self.finish_compile_and_init_runtime()
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime
+    self._c_nn_graph.init_runtime()
+oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate
+    device->Alloc(options, &ptr, size)
+Error Type: oneflow.ErrorProto.runtime_error
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator()
+
+Error Type: oneflow.ErrorProto.runtime_error
+
+[03/31 14:54:19] lb.engine.hooks INFO: Total training time: 0:00:09 (0:00:00 on hooks)
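The second run fails identically, so the 14:57 run below keeps every setting except one: it drops static graph mode. In the LazyConfig file that is a one-line change, and it is exactly the diff between the two config dumps:

    from configs.common.models.graph import graph

    graph.enabled = False  # eager mode: skip nn.Graph compilation and the
                           # init_runtime allocation that failed above
    graph.debug = 2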
+[03/31 14:57:13] libai INFO: Rank of current process: 0. World size: 4
+[03/31 14:57:13] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/31 14:57:14] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+from libai.config import LazyCall
+from libai.evaluation import PPLEvaluator
+from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
+from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
+from configs.common.optim import optim
+
+from libai.scheduler import WarmupExponentialLR
+
+from configs.common.train import train
+from configs.common.models.graph import graph
+
+graph.enabled=False
+graph.debug = 2
+
+vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
+merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
+train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
+
+tokenization.tokenizer.vocab_file = vocab_file
+tokenization.tokenizer.merges_file = merge_files
+dataloader.train.dataset[0].data_prefix = train_data_prefix
+dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
+
+# gpt2 model config
+model.cfg.pretrained_model_path = None
+model.cfg.embedding_dropout_prob = 0.1
+model.cfg.attention_dropout_prob = 0.1
+model.cfg.output_dropout_prob = 0.1
+model.cfg.num_attention_heads = 12
+model.cfg.hidden_size = 768
+model.cfg.ffn_hidden_size = 4 * 768
+model.cfg.hidden_layers = 12
+model.cfg.max_seq_length = 1024
+model.cfg.initializer_range = 0.02
+model.cfg.vocab_size = 50257
+model.cfg.layernorm_epsilon = 1e-5
+model.cfg.use_scaled_init_for_output_weights = True
+model.cfg.bias_gelu_fusion = False
+model.cfg.bias_dropout_fusion = False
+model.cfg.scale_mask_softmax_fusion = False
+model.cfg.apply_query_key_layer_scaling = True
+model.cfg.apply_residual_post_layernorm = False
+model.cfg.amp_enabled = False
+
+train.input_placement_device = "cpu"
+
+train.dist.pipeline_num_layers = model.cfg.hidden_layers
+
+for ds in dataloader.train.dataset:
+    ds.max_seq_length = model.cfg.max_seq_length
+
+optim.lr = 5.0e-05
+optim.params.clip_grad_max_norm = None
+optim.params.clip_grad_norm_type = None
+
+train.update(
+    dict(
+        output_dir="projects/MagicPrompt/oneflow_magicprompt",
+        train_micro_batch_size=4,
+        test_micro_batch_size=4,
+        train_epoch=33,
+        train_iter=10000,
+        log_period=1,
+        amp=dict(enabled=False),
+        warmup_ratio=0,
+        checkpointer=dict(period=8000, max_to_keep=20),
+        dist=dict(
+            data_parallel_size=4,
+            tensor_parallel_size=1,
+            pipeline_parallel_size=1,
+            # pipeline_num_layers = 12,
+            # custom_pipeline_stage_id = [0] * 6 + [1] * 6,
+            # pipeline_num_layers=model.cfg.hidden_layers,
+        ),
+        scheduler=LazyCall(WarmupExponentialLR)(
+            warmup_factor=0.0,
+            gamma=1.0,
+            warmup_method="linear",
+            warmup_iter=0.0,
+        ),
+        evaluation=dict(
+            enabled=True,
+            evaluator=LazyCall(PPLEvaluator)(),
+            eval_iter=250,
+            eval_period=4000,
+        ),
+        rdma_enabled=False,
+    )
+)
+
+[03/31 14:57:14] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[03/31 14:57:14] lb.engine.default INFO: > compiling dataset index builder ...
+[03/31 14:57:14] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.056 seconds
+[03/31 14:57:14] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.058 seconds
+[03/31 14:57:16] lb.engine.default INFO: Prepare training, validating, testing set
+[03/31 14:57:16] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
+[03/31 14:57:16] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
+[03/31 14:57:16] lb.data.data_utils.indexed_dataset INFO: reading sizes...
+[03/31 14:57:16] lb.data.data_utils.indexed_dataset INFO: reading pointers...
+[03/31 14:57:16] lb.data.data_utils.indexed_dataset INFO: reading document index...
+[03/31 14:57:16] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
+[03/31 14:57:16] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
+[03/31 14:57:16] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
+[03/31 14:57:16] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008085 seconds
+[03/31 14:57:16] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
+[03/31 14:57:16] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
+[03/31 14:57:16] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.005 seconds
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: total number of samples: 162429
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: total number of epochs: 36
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.002 seconds
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: total number of samples: 13536
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: total number of epochs: 3
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.002 seconds
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/31 14:57:16] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[03/31 14:57:16] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/31 14:57:16] libai INFO: > Start building model...
+[03/31 14:57:17] lb.engine.default INFO: Model:
+GPTForPreTraining( ... identical to the model dump of the 14:40 run above ... )
+[03/31 14:57:17] libai INFO: >>> done with building model. Building time: 0.885 seconds
+[03/31 14:57:17] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/31 14:57:18] lb.engine.trainer INFO: Starting training from iteration 0
+[03/31 14:57:47] lb.utils.events INFO: iteration: 0/335008 consumed_samples: 16 total_loss: 10.86 data_time: 0.0064 s/iter lr: 5.00e-05
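In eager mode the run gets past initialization and logs its first step. The reported total_loss of 10.86 is a plausible starting point: an untrained language model should score near the log of the (padded) vocabulary size (a quick check, not LiBai code):

    import math

    print(math.log(50304))  # 10.826 -- close to the logged total_loss of 10.86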
+model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = False + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=1, + amp=dict(enabled=False), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=4, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers = 12, + # custom_pipeline_stage_id = [0] * 6 + [1] * 6, + # pipeline_num_layers=model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/31 14:59:33] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml +[03/31 14:59:33] lb.engine.default INFO: > compiling dataset index builder ... +[03/31 14:59:33] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.088 seconds +[03/31 14:59:33] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.090 seconds +[03/31 14:59:35] lb.engine.default INFO: Prepare training, validating, testing set +[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: reading document index... +[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... 
+[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008213 seconds +[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.011 seconds +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: total number of samples: 162429 +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: total number of epochs: 36 +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: total number of samples: 13536 +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: total number of epochs: 3 +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/31 14:59:35] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 +[03/31 14:59:35] libai INFO: > Start building model... 
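
The module dump that follows (and recurs verbatim for every run in these logs) is fully determined by the config above. Below is a minimal sketch of the dimension arithmetic it reflects; this is an editor's illustration with ad-hoc names, and the vocab-padding rationale is an assumption, not something the patch states:

    hidden_size = 768
    num_heads = 12
    vocab_size = 50257                             # model.cfg.vocab_size

    head_dim = hidden_size // num_heads            # 64
    qkv_out = 3 * hidden_size                      # 2304: fused Q/K/V projection (query_key_value)
    ffn_hidden = 4 * hidden_size                   # 3072: model.cfg.ffn_hidden_size = 4 * 768
    # VocabEmbedding reports num_embeddings=50304: presumably 50257 rounded up
    # to a multiple of 64 so the embedding shards evenly under tensor parallelism.
    padded_vocab = ((vocab_size + 63) // 64) * 64  # 50304
    assert (qkv_out, ffn_hidden, padded_vocab) == (2304, 3072, 50304)
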
+[03/31 14:59:36] lb.engine.default INFO: Model: +GPTForPreTraining( + (GPT_model): GPTModel( + (embeddings): GPTEmbedding( + (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) + (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) + (dropout): Dropout(p=0.1, inplace=False) + ) + (transformer): Transformer( + (layers): ModuleList( + (0): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) [(1)-(10): ten more TransformerLayer blocks, identical to (0) verbatim, elided] + (11): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768,
num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (lm_head): LMLogits() + ) + (loss_func): GPTLoss( + (lm_loss): ParallelCrossEntropyLoss() + ) +) +[03/31 14:59:36] libai INFO: >>> done with building model. Building time: 0.831 seconds +[03/31 14:59:36] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR +[03/31 14:59:37] lb.engine.trainer INFO: Starting training from iteration 0 +[03/31 15:00:06] lb.utils.events INFO: iteration: 0/335008 consumed_samples: 16 total_loss: 10.86 data_time: 0.0058 s/iter lr: 5.00e-05 diff --git a/projects/MagicPrompt/oneflow_magicprompt/log.txt.rank1 b/projects/MagicPrompt/oneflow_magicprompt/log.txt.rank1 new file mode 100644 index 000000000..dd0d51b6e --- /dev/null +++ b/projects/MagicPrompt/oneflow_magicprompt/log.txt.rank1 @@ -0,0 +1,2084 @@ +[03/30 18:18:17] libai INFO: Rank of current process: 1. World size: 4 +[03/30 18:18:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/30 18:18:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + +graph.enabled=True +graph.debug = 2 + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm 
= False +model.cfg.amp_enabled = False + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=1, + amp=dict(enabled=False), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=4, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers = 12, + # custom_pipeline_stage_id = [0] * 6 + [1] * 6, + # pipeline_num_layers=model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/30 18:18:19] libai.engine.default INFO: Prepare training, validating, testing set +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: reading sizes... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: reading pointers... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: reading document index... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... 
+[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007530 seconds +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.008 seconds +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 13536 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 3 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/30 18:18:20] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 +[03/30 18:18:20] libai INFO: > Start building model... 
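
The index-map file names loaded above appear to encode the sampling parameters Megatron-style: number of samples (ns), sequence length (sl), and shuffle seed (s). A hedged sketch of that reading; the naming convention is inferred from the paths, not documented in this patch:

    import re

    name = "en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy"
    m = re.search(r"_(\d+)ns_(\d+)sl_(\d+)s_", name)
    num_samples, seq_len, seed = (int(g) for g in m.groups())
    # 1024 matches model.cfg.max_seq_length; 1234 looks like the shuffle seed.
    assert (num_samples, seq_len, seed) == (160000, 1024, 1234)
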
+[03/30 18:18:25] libai.engine.default INFO: Model: +GPTForPreTraining( [entire model structure elided: identical to the 14:59 dump above] +) +[03/30 18:18:25] libai INFO: >>> done with building model. Building time: 5.321 seconds +[03/30 18:18:25] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR +[03/30 18:18:25] libai.engine.default INFO: Graph debug mode on, automatically output debug info. +[03/30 18:18:25] libai.engine.default INFO: Graph debug mode on, automatically output debug info. +[03/30 18:18:25] libai.engine.trainer INFO: Starting training from iteration 0 +[03/30 18:18:26] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... +[03/30 18:18:35] libai.engine.trainer ERROR: Exception during training: +Traceback (most recent call last): + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train + self.run_step() + File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step + self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step + loss_dict = self.graph(**data) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__ + self._compile(*args, **kwargs) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile + self.finish_compile_and_init_runtime() + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime + self._c_nn_graph.init_runtime() +oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate + device->Alloc(options, &ptr, size) +Error Type: oneflow.ErrorProto.runtime_error + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator() + +Error Type: oneflow.ErrorProto.runtime_error + +[03/30 18:18:35] libai.engine.hooks INFO: Total training time: 0:00:10 (0:00:00 on hooks)
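
This traceback is the key failure in these logs: with graph.enabled=True the run dies while nn.Graph initializes its runtime (MluDevice::Alloc), whereas the rank-0 runs above, whose dumped config differs only by graph.enabled=False, train past iteration 0 in eager mode. The contrast reduces to one config line; a sketch of the two variants exactly as they appear in the dumped configs:

    from configs.common.models.graph import graph

    graph.enabled = False   # eager mode: trains (see the 14:57/14:59 rank-0 runs)
    # graph.enabled = True  # nn.Graph mode: MluDevice::Alloc during init_runtime()
    graph.debug = 2
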
+[03/31 14:32:17] libai INFO: Rank of current process: 1. World size: 4 +[03/31 14:32:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/31 14:32:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: [config identical to the 18:18 run above (graph.enabled=True), elided] +[03/31 14:32:19] libai.engine.default INFO: Prepare training, validating, testing set [indexed-dataset and index-map loading logs identical to the 18:18 run above, elided] +[03/31 14:32:20] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 +[03/31 14:32:20] libai INFO: > Start building model... +[03/31 14:40:48] libai INFO: Rank of current process: 1.
World size: 4 +[03/31 14:40:48] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/31 14:40:48] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: [config identical to the 18:18 run above (graph.enabled=True), elided] +[03/31 14:40:50] libai.engine.default INFO: Prepare training, validating, testing set [indexed-dataset and index-map loading logs identical to the 18:18 run above, elided] +[03/31 14:40:50] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 +[03/31 14:40:50] libai INFO: > Start building model...
+[03/31 14:40:51] libai.engine.default INFO: Model: +GPTForPreTraining( [entire model structure elided: identical to the 14:59 dump above] +) +[03/31 14:40:51] libai INFO: >>> done with building model. Building time: 0.843 seconds +[03/31 14:40:51] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR +[03/31 14:40:51] libai.engine.default INFO: Graph debug mode on, automatically output debug info. +[03/31 14:40:51] libai.engine.default INFO: Graph debug mode on, automatically output debug info. +[03/31 14:40:51] libai.engine.trainer INFO: Starting training from iteration 0 +[03/31 14:40:52] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... +[03/31 14:41:01] libai.engine.trainer ERROR: Exception during training: [traceback identical to the 18:18 run above: MluDevice::Alloc error raised from oneflow/core/memory/memory_allocator.cpp:50 during nn.Graph init_runtime(), elided] +[03/31 14:41:01] libai.engine.hooks INFO: Total training time: 0:00:09 (0:00:00 on hooks) +[03/31 14:54:05] libai INFO: Rank of current process: 1.
+[03/31 14:54:05] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/31 14:54:06] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+from libai.config import LazyCall
+from libai.evaluation import PPLEvaluator
+from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
+from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
+from configs.common.optim import optim
+
+from libai.scheduler import WarmupExponentialLR
+
+from configs.common.train import train
+from configs.common.models.graph import graph
+
+graph.enabled=True
+graph.debug = 2
+
+vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
+merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
+train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
+
+tokenization.tokenizer.vocab_file = vocab_file
+tokenization.tokenizer.merges_file = merge_files
+dataloader.train.dataset[0].data_prefix = train_data_prefix
+dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
+
+# gpt2 model config
+model.cfg.pretrained_model_path = None
+model.cfg.embedding_dropout_prob = 0.1
+model.cfg.attention_dropout_prob = 0.1
+model.cfg.output_dropout_prob = 0.1
+model.cfg.num_attention_heads = 12
+model.cfg.hidden_size = 768
+model.cfg.ffn_hidden_size = 4 * 768
+model.cfg.hidden_layers = 12
+model.cfg.max_seq_length = 1024
+model.cfg.initializer_range = 0.02
+model.cfg.vocab_size = 50257
+model.cfg.layernorm_epsilon = 1e-5
+model.cfg.use_scaled_init_for_output_weights = True
+model.cfg.bias_gelu_fusion = False
+model.cfg.bias_dropout_fusion = False
+model.cfg.scale_mask_softmax_fusion = False
+model.cfg.apply_query_key_layer_scaling = True
+model.cfg.apply_residual_post_layernorm = False
+model.cfg.amp_enabled = False
+
+train.input_placement_device = "cpu"
+
+train.dist.pipeline_num_layers = model.cfg.hidden_layers
+
+for ds in dataloader.train.dataset:
+    ds.max_seq_length = model.cfg.max_seq_length
+
+optim.lr = 5.0e-05
+optim.params.clip_grad_max_norm = None
+optim.params.clip_grad_norm_type = None
+
+train.update(
+    dict(
+        output_dir="projects/MagicPrompt/oneflow_magicprompt",
+        train_micro_batch_size=4,
+        test_micro_batch_size=4,
+        train_epoch=33,
+        train_iter=10000,
+        log_period=1,
+        amp=dict(enabled=False),
+        warmup_ratio=0,
+        checkpointer=dict(period=8000, max_to_keep=20),
+        dist=dict(
+            data_parallel_size=4,
+            tensor_parallel_size=1,
+            pipeline_parallel_size=1,
+            # pipeline_num_layers = 12,
+            # custom_pipeline_stage_id = [0] * 6 + [1] * 6,
+            # pipeline_num_layers=model.cfg.hidden_layers,
+        ),
+        scheduler=LazyCall(WarmupExponentialLR)(
+            warmup_factor=0.0,
+            gamma=1.0,
+            warmup_method="linear",
+            warmup_iter=0.0,
+        ),
+        evaluation=dict(
+            enabled=True,
+            evaluator=LazyCall(PPLEvaluator)(),
+            eval_iter=250,
+            eval_period=4000,
+        ),
+        rdma_enabled=False,
+    )
+)
+
+[03/31 14:54:08] libai.engine.default INFO: Prepare training, validating, testing set
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: building dataset index ...
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file...
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: reading sizes...
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: reading pointers...
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: reading document index...
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file...
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007189 seconds
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats:
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.013 seconds
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of samples: 13536
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of epochs: 3
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[03/31 14:54:08] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/31 14:54:08] libai INFO: > Start building model...
+[03/31 14:54:09] libai.engine.default INFO: Model:
+GPTForPreTraining( ...identical structure to the model printout above... )
+[03/31 14:54:09] libai INFO: >>> done with building model. Building time: 0.872 seconds
+[03/31 14:54:09] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/31 14:54:09] libai.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/31 14:54:09] libai.engine.trainer INFO: Starting training from iteration 0
+[03/31 14:54:10] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/31 14:54:20] libai.engine.trainer ERROR: Exception during training:
+Traceback (most recent call last): ...identical to the traceback above...
+oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
+Error Type: oneflow.ErrorProto.runtime_error
+[03/31 14:54:20] libai.engine.hooks INFO: Total training time: 0:00:10 (0:00:00 on hooks)
+[03/31 14:57:13] libai INFO: Rank of current process: 1. World size: 4
+[03/31 14:57:13] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/31 14:57:14] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+...identical to the config above, except graph.enabled=False...
+[03/31 14:57:16] libai.engine.default INFO: Prepare training, validating, testing set
+...indexed-dataset loading logs identical to the 14:54 run (73718 documents, 106365 sentences; 162429/13536/4512 samples)...
+[03/31 14:57:16] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/31 14:57:16] libai INFO: > Start building model...
+[03/31 14:57:17] libai.engine.default INFO: Model:
+GPTForPreTraining( ...identical structure to the model printout above... )
+[03/31 14:57:17] libai INFO: >>> done with building model. Building time: 0.885 seconds
+[03/31 14:57:17] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/31 14:57:17] libai.engine.trainer INFO: Starting training from iteration 0
+[03/31 14:59:32] libai INFO: Rank of current process: 1. World size: 4
+[03/31 14:59:32] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/31 14:59:32] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+...identical to the previous config (graph.enabled=False)...
+[03/31 14:59:35] libai.engine.default INFO: Prepare training, validating, testing set
+...indexed-dataset loading logs identical to the previous runs...
+[03/31 14:59:35] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/31 14:59:35] libai INFO: > Start building model...
+[03/31 14:59:36] libai.engine.default INFO: Model:
+GPTForPreTraining( ...identical structure to the model printout above... )
+[03/31 14:59:36] libai INFO: >>> done with building model. Building time: 0.831 seconds
+[03/31 14:59:36] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/31 14:59:36] libai.engine.trainer INFO: Starting training from iteration 0
diff --git a/projects/MagicPrompt/oneflow_magicprompt/log.txt.rank2 b/projects/MagicPrompt/oneflow_magicprompt/log.txt.rank2
new file mode 100644
index 000000000..042069405
--- /dev/null
+++ b/projects/MagicPrompt/oneflow_magicprompt/log.txt.rank2
@@ -0,0 +1,2084 @@
+[03/30 18:18:17] libai INFO: Rank of current process: 2.
World size: 4 +[03/30 18:18:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/30 18:18:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + +graph.enabled=True +graph.debug = 2 + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = False + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=1, + amp=dict(enabled=False), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=4, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers = 12, + # custom_pipeline_stage_id = [0] * 6 + [1] * 6, + # pipeline_num_layers=model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/30 18:18:19] libai.engine.default INFO: Prepare training, validating, testing set +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: reading sizes... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: reading pointers... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: reading document index... 
+[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008698 seconds +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.014 seconds +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 13536 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 3 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/30 18:18:19] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 +[03/30 18:18:20] libai INFO: > Start building model... 
+[03/30 18:18:25] libai INFO: >>> done with building model. Building time: 5.322 seconds
+[03/30 18:18:25] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/30 18:18:25] libai.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/30 18:18:25] libai.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/30 18:18:25] libai.engine.trainer INFO: Starting training from iteration 0
+[03/30 18:18:26] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/30 18:18:35] libai.engine.trainer ERROR: Exception during training:
+Traceback (most recent call last):
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
+    self.run_step()
+  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
+    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
+    loss_dict = self.graph(**data)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
+    self._compile(*args, **kwargs)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
+    self.finish_compile_and_init_runtime()
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime
+    self._c_nn_graph.init_runtime()
+oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate
+    device->Alloc(options, &ptr, size)
+Error Type: oneflow.ErrorProto.runtime_error
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator()
+
+Error Type: oneflow.ErrorProto.runtime_error
+
+[03/30 18:18:35] libai.engine.hooks INFO: Total training time: 0:00:10 (0:00:00 on hooks)
+[03/31 14:54:05] libai INFO: Rank of current process: 2. World size: 4
World size: 4 +[03/31 14:54:05] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/31 14:54:06] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + +graph.enabled=True +graph.debug = 2 + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = False + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=1, + amp=dict(enabled=False), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=4, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers = 12, + # custom_pipeline_stage_id = [0] * 6 + [1] * 6, + # pipeline_num_layers=model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/31 14:54:08] libai.engine.default INFO: Prepare training, validating, testing set +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: reading sizes... +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: reading pointers... +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: reading document index... 
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file...
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007197 seconds
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats:
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.010 seconds
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of samples: 13536
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of epochs: 3
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[03/31 14:54:08] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/31 14:54:08] libai INFO: > Start building model...
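The model printout that follows (and every later printout in these logs) is the standard GPT-2 small configuration. As a sanity check, a standalone back-of-envelope count from the printed shapes, not part of the log itself, recovers the expected ~124M parameters:

# Parameter count for the architecture printed below: hidden=768, 12 layers,
# 12 heads, vocab padded to 50304, 1024 positions (GPT-2 small).
hidden, layers, vocab, seq = 768, 12, 50304, 1024
per_layer = (
    hidden * 3 * hidden + 3 * hidden    # query_key_value: Linear1D(768 -> 2304) + bias
    + hidden * hidden + hidden          # attention dense: Linear1D(768 -> 768) + bias
    + hidden * 4 * hidden + 4 * hidden  # dense_h_to_4h: Linear1D(768 -> 3072) + bias
    + 4 * hidden * hidden + hidden      # dense_4h_to_h: Linear1D(3072 -> 768) + bias
    + 4 * hidden                        # input/post_attention LayerNorm weights and biases
)
total = layers * per_layer + vocab * hidden + seq * hidden + 2 * hidden  # embeddings + layernorm_f
print(f"{total / 1e6:.1f}M parameters")  # -> 124.5M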
+[03/31 14:54:09] libai.engine.default INFO: Model:
+GPTForPreTraining(
+  [model printout identical to the 14:40 run above: GPTEmbedding, 12 identical TransformerLayer blocks, layernorm_f, LMLogits, GPTLoss]
+)
+[03/31 14:54:09] libai INFO: >>> done with building model. Building time: 0.872 seconds
+[03/31 14:54:09] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/31 14:54:09] libai.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/31 14:54:09] libai.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/31 14:54:09] libai.engine.trainer INFO: Starting training from iteration 0
+[03/31 14:54:10] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/31 14:54:19] libai.engine.trainer ERROR: Exception during training:
+[traceback identical to the 14:41 failure above: MluDevice::Alloc error raised from nn.Graph finish_compile_and_init_runtime]
+[03/31 14:54:19] libai.engine.hooks INFO: Total training time: 0:00:10 (0:00:00 on hooks)
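After two identical graph-mode failures, the retries below switch to eager execution, and in this log those runs proceed past model building into training without hitting the allocator error. The only difference between the failing and succeeding config dumps is the graph flag:

# The single config change between the failing 14:54 run and the 14:57 retry,
# as shown in the dumps that follow (graph.debug stays at 2):
from configs.common.models.graph import graph

graph.enabled = False  # run the model eagerly instead of compiling an nn.Graph
graph.debug = 2        # unchanged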
+[03/31 14:57:13] libai INFO: Rank of current process: 2. World size: 4
+[03/31 14:57:13] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/31 14:57:14] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+[same configuration as the 14:54 run above, except graph.enabled=False]
+[03/31 14:57:16] libai.engine.default INFO: Prepare training, validating, testing set
+[same indexed-dataset build and index-map loading output as the 14:54 run: 73718 documents, 106365 sentences; 162429, 13536 and 4512 samples]
+[03/31 14:57:16] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/31 14:57:16] libai INFO: > Start building model...
+[03/31 14:57:17] libai.engine.default INFO: Model:
+[model printout identical to the ones above]
+[03/31 14:57:17] libai INFO: >>> done with building model. Building time: 0.885 seconds
+[03/31 14:57:17] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/31 14:57:17] libai.engine.trainer INFO: Starting training from iteration 0
+[03/31 14:59:32] libai INFO: Rank of current process: 2. World size: 4
+[03/31 14:59:32] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/31 14:59:32] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+[same configuration as the 14:57 run: graph.enabled=False]
+[03/31 14:59:35] libai.engine.default INFO: Prepare training, validating, testing set
+[same indexed-dataset build and index-map loading output as the previous runs]
+[03/31 14:59:35] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/31 14:59:35] libai INFO: > Start building model...
+[03/31 14:59:36] libai.engine.default INFO: Model:
+[model printout identical to the ones above]
+[03/31 14:59:36] libai INFO: >>> done with building model. Building time: 0.831 seconds
+[03/31 14:59:36] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/31 14:59:36] libai.engine.trainer INFO: Starting training from iteration 0
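The rank 3 log added next repeats the same runs from another process; each rank writes its own projects/MagicPrompt/oneflow_magicprompt/log.txt.rank<N>. When the files are this repetitive, a small script (a sketch; any log-scanning approach works) makes it easy to confirm every rank recorded the same allocator failure:

# Sketch: summarize the per-rank LiBai logs added by this patch, counting how
# many ERROR lines and MluDevice::Alloc failures each rank recorded.
import glob

for path in sorted(glob.glob("projects/MagicPrompt/oneflow_magicprompt/log.txt.rank*")):
    with open(path) as f:
        lines = f.readlines()
    errors = sum("ERROR" in line for line in lines)
    allocs = sum("MluDevice::Alloc" in line for line in lines)
    print(f"{path}: {errors} ERROR lines, {allocs} MLU alloc failures")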
World size: 4 +[03/30 18:18:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/30 18:18:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + +graph.enabled=True +graph.debug = 2 + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = False + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=1, + amp=dict(enabled=False), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=4, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers = 12, + # custom_pipeline_stage_id = [0] * 6 + [1] * 6, + # pipeline_num_layers=model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/30 18:18:19] libai.engine.default INFO: Prepare training, validating, testing set +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: reading sizes... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: reading pointers... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: reading document index... 
+[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007484 seconds +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.014 seconds +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 13536 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 3 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/30 18:18:19] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 +[03/30 18:18:20] libai INFO: > Start building model... 
+[03/30 18:18:25] libai.engine.default INFO: Model:
+GPTForPreTraining(
+  (GPT_model): GPTModel(
+    (embeddings): GPTEmbedding(
+      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
+      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
+      (dropout): Dropout(p=0.1, inplace=False)
+    )
+    (transformer): Transformer(
+      (layers): ModuleList(
+        (0): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        [layers (1)-(10): ten further TransformerLayer blocks, identical to (0), elided]
+        (11): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768,
num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (lm_head): LMLogits() + ) + (loss_func): GPTLoss( + (lm_loss): ParallelCrossEntropyLoss() + ) +) +[03/30 18:18:25] libai INFO: >>> done with building model. Building time: 5.321 seconds +[03/30 18:18:25] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR +[03/30 18:18:25] libai.engine.default INFO: Graph debug mode on, automatically output debug info. +[03/30 18:18:25] libai.engine.default INFO: Graph debug mode on, automatically output debug info. +[03/30 18:18:25] libai.engine.trainer INFO: Starting training from iteration 0 +[03/30 18:18:26] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... +[03/30 18:18:35] libai.engine.trainer ERROR: Exception during training: +Traceback (most recent call last): + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train + self.run_step() + File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step + self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step + loss_dict = self.graph(**data) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__ + self._compile(*args, **kwargs) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile + self.finish_compile_and_init_runtime() + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime + self._c_nn_graph.init_runtime() +oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate + device->Alloc(options, &ptr, size) +Error Type: oneflow.ErrorProto.runtime_error + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator() + +Error Type: oneflow.ErrorProto.runtime_error + +[03/30 18:18:35] libai.engine.hooks INFO: Total training time: 0:00:10 (0:00:00 on hooks) +[03/31 14:32:17] libai INFO: Rank of current process: 3. 
World size: 4 +[03/31 14:32:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/31 14:32:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + +graph.enabled=True +graph.debug = 2 + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = False + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=1, + amp=dict(enabled=False), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=4, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers = 12, + # custom_pipeline_stage_id = [0] * 6 + [1] * 6, + # pipeline_num_layers=model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/31 14:32:19] libai.engine.default INFO: Prepare training, validating, testing set +[03/31 14:32:19] libai.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/31 14:32:19] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/31 14:32:19] libai.data.data_utils.indexed_dataset INFO: reading sizes... +[03/31 14:32:19] libai.data.data_utils.indexed_dataset INFO: reading pointers... +[03/31 14:32:19] libai.data.data_utils.indexed_dataset INFO: reading document index... 
+[03/31 14:32:19] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/31 14:32:19] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/31 14:32:19] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/31 14:32:19] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.006762 seconds +[03/31 14:32:19] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/31 14:32:19] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/31 14:32:19] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.011 seconds +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429 +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36 +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 13536 +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 3 +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/31 14:32:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/31 14:32:20] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 +[03/31 14:32:20] libai INFO: > Start building model... +[03/31 14:40:48] libai INFO: Rank of current process: 3. 
World size: 4 +[03/31 14:40:48] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/31 14:40:48] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + +graph.enabled=True +graph.debug = 2 + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = False + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=1, + amp=dict(enabled=False), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=4, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers = 12, + # custom_pipeline_stage_id = [0] * 6 + [1] * 6, + # pipeline_num_layers=model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/31 14:40:50] libai.engine.default INFO: Prepare training, validating, testing set +[03/31 14:40:50] libai.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/31 14:40:50] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/31 14:40:50] libai.data.data_utils.indexed_dataset INFO: reading sizes... +[03/31 14:40:50] libai.data.data_utils.indexed_dataset INFO: reading pointers... +[03/31 14:40:50] libai.data.data_utils.indexed_dataset INFO: reading document index... 
+[03/31 14:40:50] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/31 14:40:50] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/31 14:40:50] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/31 14:40:50] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008170 seconds +[03/31 14:40:50] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/31 14:40:50] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/31 14:40:50] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.011 seconds +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429 +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36 +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: total number of samples: 13536 +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: total number of epochs: 3 +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/31 14:40:50] libai.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/31 14:40:50] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 +[03/31 14:40:50] libai INFO: > Start building model... 
+[03/31 14:40:51] libai.engine.default INFO: Model: +GPTForPreTraining( + (GPT_model): GPTModel( + (embeddings): GPTEmbedding( + (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) + (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) + (dropout): Dropout(p=0.1, inplace=False) + ) + (transformer): Transformer( + (layers): ModuleList( + (0): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (1): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (2): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (3): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): 
Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (4): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (5): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (6): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (7): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, 
inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (8): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (9): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (10): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (11): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, 
num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (lm_head): LMLogits() + ) + (loss_func): GPTLoss( + (lm_loss): ParallelCrossEntropyLoss() + ) +) +[03/31 14:40:51] libai INFO: >>> done with building model. Building time: 0.843 seconds +[03/31 14:40:51] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR +[03/31 14:40:51] libai.engine.default INFO: Graph debug mode on, automatically output debug info. +[03/31 14:40:51] libai.engine.default INFO: Graph debug mode on, automatically output debug info. +[03/31 14:40:51] libai.engine.trainer INFO: Starting training from iteration 0 +[03/31 14:40:52] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... +[03/31 14:41:01] libai.engine.trainer ERROR: Exception during training: +Traceback (most recent call last): + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train + self.run_step() + File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step + self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step + loss_dict = self.graph(**data) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__ + self._compile(*args, **kwargs) + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile + self.finish_compile_and_init_runtime() + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime + self._c_nn_graph.init_runtime() +oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate + device->Alloc(options, &ptr, size) +Error Type: oneflow.ErrorProto.runtime_error + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator() + +Error Type: oneflow.ErrorProto.runtime_error + +[03/31 14:41:01] libai.engine.hooks INFO: Total training time: 0:00:10 (0:00:00 on hooks) +[03/31 14:54:05] libai INFO: Rank of current process: 3. 
World size: 4 +[03/31 14:54:05] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[03/31 14:54:06] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + +graph.enabled=True +graph.debug = 2 + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = False + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 +optim.params.clip_grad_max_norm = None +optim.params.clip_grad_norm_type = None + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=1, + amp=dict(enabled=False), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=4, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers = 12, + # custom_pipeline_stage_id = [0] * 6 + [1] * 6, + # pipeline_num_layers=model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( + warmup_factor=0.0, + gamma=1.0, + warmup_method="linear", + warmup_iter=0.0, + ), + evaluation=dict( + enabled=True, + evaluator=LazyCall(PPLEvaluator)(), + eval_iter=250, + eval_period=4000, + ), + rdma_enabled=False, + ) +) + +[03/31 14:54:08] libai.engine.default INFO: Prepare training, validating, testing set +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: building dataset index ... +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: reading sizes... +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: reading pointers... +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: reading document index... 
+[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.006892 seconds +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.005 seconds +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429 +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36 +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of samples: 13536 +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of epochs: 3 +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of samples: 4512 +[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of epochs: 1 +[03/31 14:54:08] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 +[03/31 14:54:08] libai INFO: > Start building model... 
+[03/31 14:54:09] libai.engine.default INFO: Model:
+GPTForPreTraining(
+  (GPT_model): GPTModel(
+    (embeddings): GPTEmbedding(
+      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
+      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
+      (dropout): Dropout(p=0.1, inplace=False)
+    )
+    (transformer): Transformer(
+      (layers): ModuleList(
+        (0): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        [... layers (1)-(11) omitted: verbatim repeats of layer (0) ...]
+      )
+      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+    )
+    (lm_head): LMLogits()
+  )
+  (loss_func): GPTLoss(
+    (lm_loss): ParallelCrossEntropyLoss()
+  )
+)
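A quick sanity check on the dump above: the printed shapes imply the standard GPT-2 small size. A back-of-the-envelope count (plain Python, not part of the patch; it covers only the weights the dump shows):

hidden, layers, vocab, seq = 768, 12, 50304, 1024

per_layer = (
    hidden * 3 * hidden + 3 * hidden      # query_key_value: 768 -> 2304, plus bias
    + hidden * hidden + hidden            # attention output dense, plus bias
    + hidden * 4 * hidden + 4 * hidden    # dense_h_to_4h, plus bias
    + 4 * hidden * hidden + hidden        # dense_4h_to_h, plus bias
    + 4 * hidden                          # two LayerNorms (weight + bias each)
)
total = (
    vocab * hidden        # token_embeddings (vocab padded to 50304)
    + seq * hidden        # position_embeddings
    + layers * per_layer
    + 2 * hidden          # final layernorm_f
)
print(f"~{total / 1e6:.1f}M parameters")  # ~124.5M, the usual GPT-2 small budget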
+[03/31 14:54:09] libai INFO: >>> done with building model. Building time: 0.872 seconds
+[03/31 14:54:09] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/31 14:54:09] libai.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/31 14:54:09] libai.engine.default INFO: Graph debug mode on, automatically output debug info.
+[03/31 14:54:09] libai.engine.trainer INFO: Starting training from iteration 0
+[03/31 14:54:10] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
+[03/31 14:54:19] libai.engine.trainer ERROR: Exception during training:
+Traceback (most recent call last):
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
+    self.run_step()
+  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
+    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
+  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
+    loss_dict = self.graph(**data)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
+    self._compile(*args, **kwargs)
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
+    self.finish_compile_and_init_runtime()
+  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime
+    self._c_nn_graph.init_runtime()
+oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate
+    device->Alloc(options, &ptr, size)
+Error Type: oneflow.ErrorProto.runtime_error
+  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator()
+
+Error Type: oneflow.ErrorProto.runtime_error
+
+[03/31 14:54:19] libai.engine.hooks INFO: Total training time: 0:00:10 (0:00:00 on hooks)
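The MluDevice::Alloc failure above is the MLU running out of device memory while the static train graph initializes its runtime buffers. A plausible mitigation sketch (hypothetical edits to projects/MagicPrompt/configs/gpt2_training.py, using only knobs this config already exposes; the re-run below instead continues with graph.enabled=False, i.e. eager mode):

# Hypothetical memory-reduction tweaks; a sketch, not what the patch itself does.
train.train_micro_batch_size = 2                   # halve per-device activations
train.activation_checkpoint = dict(enabled=True)   # recompute activations in backward
graph.enabled = False                              # or skip static-graph compilation entirely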
+[03/31 14:57:13] libai INFO: Rank of current process: 3. World size: 4
+[03/31 14:57:13] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/31 14:57:13] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+from libai.config import LazyCall
+from libai.evaluation import PPLEvaluator
+from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
+from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
+from configs.common.optim import optim
+
+from libai.scheduler import WarmupExponentialLR
+
+from configs.common.train import train
+from configs.common.models.graph import graph
+
+graph.enabled=False
+graph.debug = 2
+
+vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
+merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
+train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
+
+tokenization.tokenizer.vocab_file = vocab_file
+tokenization.tokenizer.merges_file = merge_files
+dataloader.train.dataset[0].data_prefix = train_data_prefix
+dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
+
+# gpt2 model config
+model.cfg.pretrained_model_path = None
+model.cfg.embedding_dropout_prob = 0.1
+model.cfg.attention_dropout_prob = 0.1
+model.cfg.output_dropout_prob = 0.1
+model.cfg.num_attention_heads = 12
+model.cfg.hidden_size = 768
+model.cfg.ffn_hidden_size = 4 * 768
+model.cfg.hidden_layers = 12
+model.cfg.max_seq_length = 1024
+model.cfg.initializer_range = 0.02
+model.cfg.vocab_size = 50257
+model.cfg.layernorm_epsilon = 1e-5
+model.cfg.use_scaled_init_for_output_weights = True
+model.cfg.bias_gelu_fusion = False
+model.cfg.bias_dropout_fusion = False
+model.cfg.scale_mask_softmax_fusion = False
+model.cfg.apply_query_key_layer_scaling = True
+model.cfg.apply_residual_post_layernorm = False
+model.cfg.amp_enabled = False
+
+train.input_placement_device = "cpu"
+
+train.dist.pipeline_num_layers = model.cfg.hidden_layers
+
+for ds in dataloader.train.dataset:
+    ds.max_seq_length = model.cfg.max_seq_length
+
+optim.lr = 5.0e-05
+optim.params.clip_grad_max_norm = None
+optim.params.clip_grad_norm_type = None
+
+train.update(
+    dict(
+        output_dir="projects/MagicPrompt/oneflow_magicprompt",
+        train_micro_batch_size=4,
+        test_micro_batch_size=4,
+        train_epoch=33,
+        train_iter=10000,
+        log_period=1,
+        amp=dict(enabled=False),
+        warmup_ratio=0,
+        checkpointer=dict(period=8000, max_to_keep=20),
+        dist=dict(
+            data_parallel_size=4,
+            tensor_parallel_size=1,
+            pipeline_parallel_size=1,
+            # pipeline_num_layers = 12,
+            # custom_pipeline_stage_id = [0] * 6 + [1] * 6,
+            # pipeline_num_layers=model.cfg.hidden_layers,
+        ),
+        scheduler=LazyCall(WarmupExponentialLR)(
+            warmup_factor=0.0,
+            gamma=1.0,
+            warmup_method="linear",
+            warmup_iter=0.0,
+        ),
+        evaluation=dict(
+            enabled=True,
+            evaluator=LazyCall(PPLEvaluator)(),
+            eval_iter=250,
+            eval_period=4000,
+        ),
+        rdma_enabled=False,
+    )
+)
+
+[03/31 14:57:16] libai.engine.default INFO: Prepare training, validating, testing set
+[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: building dataset index ...
+[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file...
+[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: reading sizes...
+[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: reading pointers...
+[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: reading document index...
+[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file...
+[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
+[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
+[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.006980 seconds
+[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats:
+[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718
+[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365
+[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy
+[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy
+[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy
+[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.014 seconds
+[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429
+[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36
+[... identical index-mapping loads for the 12000ns split (13536 samples, 3 epochs) and the 4000ns split (4512 samples, 1 epoch) omitted ...]
+[03/31 14:57:16] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/31 14:57:16] libai INFO: > Start building model...
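The auto-scaled train_iter above is consistent with the epoch-based settings in the config. A rough consistency check (plain Python; the exact rounding LibAI applies is an assumption):

samples_per_epoch = 162429   # training split, from the dataset log above
train_epoch = 33             # from gpt2_training.py
global_batch = 4 * 4         # train_micro_batch_size=4 x data_parallel_size=4

print(train_epoch * samples_per_epoch // global_batch)
# -> 335009, within one-step rounding of the logged train.train_iter=335008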
+[03/31 14:57:17] libai.engine.default INFO: Model:
+[... GPTForPreTraining dump omitted: identical to the model printed at 14:54:09 above ...]
+[03/31 14:57:17] libai INFO: >>> done with building model. Building time: 0.885 seconds
+[03/31 14:57:17] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/31 14:57:17] libai.engine.trainer INFO: Starting training from iteration 0
+[03/31 14:59:32] libai INFO: Rank of current process: 3. World size: 4
+[03/31 14:59:32] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[03/31 14:59:32] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+[... config listing omitted: identical to the one printed at 14:57:13 above ...]
+[03/31 14:59:35] libai.engine.default INFO: Prepare training, validating, testing set
+[... indexed-dataset build and index-mapping loads omitted: same dataset as the 14:57:16 run (73718 documents, 106365 sentences; 162429/13536/4512 samples over 36/3/1 epochs) ...]
+[03/31 14:59:35] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[03/31 14:59:35] libai INFO: > Start building model...
+[03/31 14:59:36] libai.engine.default INFO: Model:
+[... GPTForPreTraining dump omitted: identical to the model printed at 14:54:09 above ...]
+[03/31 14:59:36] libai INFO: >>> done with building model. Building time: 0.831 seconds
+[03/31 14:59:36] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[03/31 14:59:36] libai.engine.trainer INFO: Starting training from iteration 0
diff --git a/projects/MagicPrompt/oneflow_magicprompt/metrics.json b/projects/MagicPrompt/oneflow_magicprompt/metrics.json
new file mode 100644
index 000000000..3ab61a6d4
--- /dev/null
+++ b/projects/MagicPrompt/oneflow_magicprompt/metrics.json
@@ -0,0 +1,65 @@
+{"data_time": 0.0014411650045076385, "iteration": 0, "lr": 5e-05, "total_loss": 10.86047077178955}
+{"data_time": 0.0009524804991087876, "iteration": 1, "lr": 5e-05, "total_loss": 10.083935260772705}
+{"data_time": 0.001286156999412924, "eta_seconds": 8649448.917897075, "iteration": 2, "lr": 5e-05, "time": 25.818865144989104, "total_loss": 9.440001487731934}
+[... iterations 3-17 omitted: lr constant at 5e-05, step time ~25.6-25.8 s, total_loss falling from 9.37 to 8.69 ...]
+[... two short replays of the same run (iterations 0-2 and 0-3) omitted ...]
+[... two overlapping restarted runs with ~6.4 s steps omitted: total_loss hovers near 11.33 and eases to ~11.31 over iterations 0-29 ...]
+{"data_time": 0.006361130013829097, "iteration": 0, "lr": 5e-05, "total_loss": 10.858524322509766}
+{"data_time": 0.005785448011010885, "iteration": 0, "lr": 5e-05, "total_loss": 10.858524322509766}
From 338177b3d60980a6c17824187733de22a9dfd3f2 Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Fri, 31 Mar 2023 15:02:22 +0800
Subject: [PATCH 11/25] refine

---
 .../oneflow_magicprompt/config.yaml           |   76 -
 .../events.out.tfevents.1680146933.oneflow-32 |  Bin 3920 -> 0 bytes
 .../events.out.tfevents.1680147471.oneflow-32 |  Bin 40 -> 0 bytes
 .../events.out.tfevents.1680148160.oneflow-32 |  Bin 40 -> 0 bytes
 .../events.out.tfevents.1680148191.oneflow-32 |  Bin 40 -> 0 bytes
 .../events.out.tfevents.1680162058.oneflow-32 |  Bin 40 -> 0 bytes
 .../events.out.tfevents.1680163617.oneflow-32 |    0
 .../events.out.tfevents.1680163809.oneflow-32 |    0
 .../events.out.tfevents.1680163941.oneflow-32 |    0
 .../events.out.tfevents.1680168494.oneflow-32 |    0
 .../events.out.tfevents.1680168780.oneflow-32 |  Bin 40 -> 0 bytes
 .../events.out.tfevents.1680168820.oneflow-32 |  Bin 3920 -> 0 bytes
 .../events.out.tfevents.1680171506.oneflow-32 |  Bin 40 -> 0 bytes
 .../events.out.tfevents.1680244302.oneflow-32 |  Bin 40 -> 0 bytes
 .../events.out.tfevents.1680244852.oneflow-32 |  Bin 40 -> 0 bytes
 .../events.out.tfevents.1680245650.oneflow-32 |  Bin 40 -> 0 bytes
 .../events.out.tfevents.1680245838.oneflow-32 |    0
 .../events.out.tfevents.1680245977.oneflow-32 |    0
 .../MagicPrompt/oneflow_magicprompt/log.txt   | 6904 -----------------
 .../oneflow_magicprompt/log.txt.rank1         | 2084 -----
 .../oneflow_magicprompt/log.txt.rank2         | 2084 -----
 .../oneflow_magicprompt/log.txt.rank3         | 2084 -----
 .../oneflow_magicprompt/metrics.json          |   65 -
 23 files changed, 13297 deletions(-)
 [... 23 matching "delete mode 100644" lines omitted: one per file listed above ...]
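A side note on the metrics.json this patch deletes: it is a JSON-lines log, one record per logged iteration, and because the runs above were restarted several times the iteration numbers repeat. A minimal reader that keeps only the final run (a sketch; the file path follows this repo's output_dir):

import json

with open("projects/MagicPrompt/oneflow_magicprompt/metrics.json") as f:
    records = [json.loads(line) for line in f]

# Iteration numbers restart at 0 on every rerun; keep the last run only.
last_start = max(i for i, r in enumerate(records) if r["iteration"] == 0)
for r in records[last_start:]:
    print(r["iteration"], r["lr"], r["total_loss"])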
+++ /dev/null
@@ -1,76 +0,0 @@
-dataloader:
-  train:
-    _target_: libai.data.build_nlp_train_val_test_loader
-    dataset:
-    - _target_: libai.data.datasets.GPT2Dataset
-      data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence
-      indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false}
-      max_seq_length: 1024
-      name: gpt-2
-      seed: 1234
-    num_workers: 4
-    splits:
-    - [949.0, 50.0, 1.0]
-    train_val_test_num_samples: null
-    weights: [1.0]
-ds:
-  _target_: libai.data.datasets.GPT2Dataset
-  data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence
-  indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false}
-  max_seq_length: 1024
-  name: gpt-2
-  seed: 1234
-graph:
-  auto_parallel: {enable_auto_parallel_ignore_user_sbp_config: false, enabled: false, sbp_collector: false, trunk_algo: true}
-  debug: 2
-  enabled: false
-  eval_graph: {_target_: libai.models.utils.GraphBase, is_train: false}
-  train_graph: {_target_: libai.models.utils.GraphBase, is_train: true}
-model:
-  _target_: projects.MagicPrompt.gpt2.GPTForPreTraining
-  cfg: {amp_enabled: false, apply_query_key_layer_scaling: true, apply_residual_post_layernorm: false, attention_dropout_prob: 0.1, bias_dropout_fusion: false, bias_gelu_fusion: false, bos_token_id: 50256, chunk_size_feed_forward: 0, decoder_start_token_id: null, diversity_penalty: 0.0, do_sample: false, early_stopping: false, embedding_dropout_prob: 0.1, encoder_no_repeat_ngram_size: 0, eos_token_id: 50256, exponential_decay_length_penalty: null, ffn_hidden_size: 3072, forced_bos_token_id: null, forced_eos_token_id: null, hidden_layers: 12, hidden_size: 768, initializer_range: 0.02, is_encoder_decoder: false, layernorm_epsilon: 1.0e-05, length_penalty: 1.0, max_length: 20, max_seq_length: 1024, min_length: 0, no_repeat_ngram_size: 0, num_attention_heads: 12, num_beam_groups: 1, num_beams: 1, num_return_sequences: 1, output_dropout_prob: 0.1, output_scores: false, pad_token_id: 0, pretrained_model_path: null, remove_invalid_values: false, repetition_penalty: 1.0, scale_mask_softmax_fusion: false, sep_token_id: null, temperature: 1.0, top_k: 50, top_p: 1.0, typical_p: 1.0, use_cache: true, use_scaled_init_for_output_weights: true, vocab_size: 50257}
-optim:
-  _target_: oneflow.nn.optimizer.adamw.AdamW
-  betas: [0.9, 0.999]
-  do_bias_correction: true
-  eps: 1.0e-08
-  lr: 5.0e-05
-  params: {_target_: libai.optim.get_default_optimizer_params, clip_grad_max_norm: null, clip_grad_norm_type: null, weight_decay_bias: 0.0, weight_decay_norm: 0.0}
-  weight_decay: 0.01
-tokenization:
-  append_eod: false
-  make_vocab_size_divisible_by: 128
-  tokenizer: {_target_: libai.tokenizer.GPT2Tokenizer, do_chinese_wwm: true, do_lower_case: true, merges_file: /home/zhangxiaoyu/magicprompt/merges.txt, vocab_file: /home/zhangxiaoyu/magicprompt/vocab.json}
-train:
-  activation_checkpoint: {enabled: false}
-  amp: {enabled: false}
-  checkpointer: {max_to_keep: 20, period: 8000}
-  consumed_train_samples: 0
-  consumed_valid_samples: 0
-  dist: {data_parallel_size: 4, num_gpus_per_node: 4, num_nodes: 1, pipeline_num_layers: 10000, pipeline_parallel_size: 1, tensor_parallel_size: 1}
-  evaluation:
-    enabled: true
-    eval_iter: 250
-    eval_period: 4000
-    evaluator: {_target_: libai.evaluation.PPLEvaluator}
-  global_batch_size: 16
-  input_placement_device: cpu
-  load_weight: ''
-  log_period: 1
-  nccl_fusion_max_ops: 24
-  nccl_fusion_threshold_mb: 16
-  num_accumulation_steps: 1
-  output_dir: projects/MagicPrompt/oneflow_magicprompt
-  rdma_enabled: false
-  resume: false
-  samples: 160000
-  scheduler: {_target_: libai.scheduler.WarmupExponentialLR, gamma: 1.0, warmup_factor: 0.0, warmup_iter: 0.0, warmup_method: linear}
-  seed: 1234
-  start_iter: 0
-  test_micro_batch_size: 4
-  train_epoch: 33
-  train_iter: 10000
-  train_micro_batch_size: 4
-  train_samples: null
-  warmup_ratio: 0
-  zero_optimization: {enabled: false, stage: 1}
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680146933.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680146933.oneflow-32
deleted file mode 100644
index 6481a1dc7430a67b9cdba76a73d0266709fcdf89..0000000000000000000000000000000000000000
GIT binary patch
[base85 payload omitted]
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680147471.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680147471.oneflow-32
deleted file mode 100644
index 64998e40b1cffeec73952b8f227b8d6d72674b43..0000000000000000000000000000000000000000
GIT binary patch
[base85 payload omitted]
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680148160.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680148160.oneflow-32
deleted file mode 100644
index d00c6968ba3861f9106150eba8d469512c3f3d21..0000000000000000000000000000000000000000
GIT binary patch
[base85 payload omitted]
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680148191.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680148191.oneflow-32
deleted file mode 100644
index 2d3ebe79e641292fff8da172674c944f167e34e5..0000000000000000000000000000000000000000
GIT binary patch
[base85 payload omitted]
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680162058.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680162058.oneflow-32
deleted file mode 100644
index 2c464c8d1a6dc4caafc59022090d1b19e7ce37a3..0000000000000000000000000000000000000000
GIT binary patch
[base85 payload omitted]
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680163617.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680163617.oneflow-32
deleted file mode 100644
index e69de29bb..000000000
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680163809.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680163809.oneflow-32
deleted file mode 100644
index e69de29bb..000000000
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680163941.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680163941.oneflow-32
deleted file mode 100644
index e69de29bb..000000000
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680168494.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680168494.oneflow-32
deleted file mode 100644
index e69de29bb..000000000
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680168780.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680168780.oneflow-32
deleted file mode 100644
index e844923f49823098b8e420364a6a4438b40e938f..0000000000000000000000000000000000000000
GIT binary patch
[base85 payload omitted]
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680168820.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680168820.oneflow-32
deleted file mode 100644
index 8ed63e9be45058d52f5cd47425c9b11c88dfae62..0000000000000000000000000000000000000000
GIT binary patch
[base85 payload omitted]
z&D@gp2nidGLN68OojW-h3Atm?i9&wExpBw(BH@tlp_B<*5_JaOM7x(r=yLVrRzcX8 z6!IIYOOHDsp`i+LQs^E_vjrpj1tVceHS|(pd-sTTBs|GMCkpuuw?7=|jf8iuLn#x+ zSU0>8Wn6@W)u+3k2tsF4$UlebhC}!Z&+$Cuq_8Q0X8qjEiAdOU5qhccyrroN2|df8 z6NT-+a{6MWrW*C-WDHFC@ zpTozWIZi{uO|_NJ1tCQWtAY{P!h2vw5g%ihxT*` z!na6a=Yea?%&f|EESv{9DV(>MX494yEJwl@VbDv3Y{sAXQ)-HUP89MRK5lMZgM`NU zP|Ae$aXLe&`L+fL^VsZ8L8v5!{DwU{!tryc3xJ#yhAp63yO2CLB%Bcly;SJ6vCl5F zp>{TOqLAM(&AHPK2^+*xCakn>n2_RDfrQ?cab1GYl@#*N;nGcm_M;6;UPDd_!$WCy z`KhF%NO<}U^ituv(zHWJxXEt(|7VE853g|g(*7y<7k|DzlrrH)>wj&Z`0mX?!jb#0 zyby$Lq_DUnpNVWa<%Wa~X2?k)zu~9)w;htZpxvj)bqpP82>Xrx&6P z>+|6iWx{pwI>UL5vJDBFa-+Kip^6kbM4Fi~4KJS}VbZsdlR|#O*1&)$B>c?`y;PX4 ze0~`T3-X~8h5Uvsy;=?lQwpJ!3B#=$8Zz4^AfeN!S3QEzofIw)D`0jmm@^j%zZwlW zDZCUyvo&o&2a#~tSm>p~62`>IBz;i@g@}|X6EU+mZj#ESQ!b)YNT9%quVr68zA!sfD$GHpn diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680245650.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680245650.oneflow-32 deleted file mode 100644 index 247d83caee7d86f8e4ec5bdbdd1dafb243af61c3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 40 rcmb1OfPlsI-b$R}g)5#c;JoQ5#hX-=n3<>NT9%quVrBGu_ttX&&kGHZ diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680245838.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680245838.oneflow-32 deleted file mode 100644 index e69de29bb..000000000 diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680245977.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1680245977.oneflow-32 deleted file mode 100644 index e69de29bb..000000000 diff --git a/projects/MagicPrompt/oneflow_magicprompt/log.txt b/projects/MagicPrompt/oneflow_magicprompt/log.txt deleted file mode 100644 index 6d6ef73f9..000000000 --- a/projects/MagicPrompt/oneflow_magicprompt/log.txt +++ /dev/null @@ -1,6904 +0,0 @@ -[03/30 11:28:49] libai INFO: Rank of current process: 0. 
World size: 1 -[03/30 11:28:49] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[03/30 11:28:49] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - -graph.enabled=False - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = False -model.cfg.bias_dropout_fusion = False -model.cfg.scale_mask_softmax_fusion = False -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = False - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 -optim.params.clip_grad_max_norm = None -optim.params.clip_grad_norm_type = None - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=1, - amp=dict(enabled=False), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=1, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers=model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/30 11:28:49] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml -[03/30 11:28:49] lb.engine.default INFO: > compiling dataset index builder ... -[03/30 11:28:50] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.054 seconds -[03/30 11:28:50] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.055 seconds -[03/30 11:28:52] lb.engine.default INFO: Prepare training, validating, testing set -[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: building dataset index ... 
-[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... -[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: reading sizes... -[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: reading pointers... -[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: reading document index... -[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... -[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... -[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... -[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008338 seconds -[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: -[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 -[03/30 11:28:52] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/30 11:28:52] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/30 11:28:52] 
lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 -[03/30 11:28:52] libai INFO: > Start building model... -[03/30 11:28:52] lb.engine.default INFO: Model: -GPTForPreTraining( - (GPT_model): GPTModel( - (embeddings): GPTEmbedding( - (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) - (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) - (dropout): Dropout(p=0.1, inplace=False) - ) - (transformer): Transformer( - (layers): ModuleList( - (0): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (1): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (2): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (3): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - 
(output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (4): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (5): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (6): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (7): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): 
MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (8): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (9): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (10): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (11): TransformerLayer( - 
(drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - ) - (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - ) - (lm_head): LMLogits() - ) - (loss_func): GPTLoss( - (lm_loss): ParallelCrossEntropyLoss() - ) -) -[03/30 11:28:52] libai INFO: >>> done with building model. Building time: 0.145 seconds -[03/30 11:28:52] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR -[03/30 11:28:53] lb.engine.trainer INFO: Starting training from iteration 0 -[03/30 11:29:18] lb.utils.events INFO: iteration: 0/335008 consumed_samples: 4 total_loss: 10.86 data_time: 0.0014 s/iter lr: 5.00e-05 -[03/30 11:29:44] lb.utils.events INFO: eta: 100 days, 0:25:31 iteration: 1/335008 consumed_samples: 8 total_loss: 10.08 data_time: 0.0010 s/iter lr: 5.00e-05 -[03/30 11:30:10] lb.utils.events INFO: eta: 100 days, 2:37:28 iteration: 2/335008 consumed_samples: 12 total_loss: 9.44 time: 25.8189 s/iter data_time: 0.0011 s/iter total_throughput: 0.15 samples/s lr: 5.00e-05 -[03/30 11:30:35] lb.utils.events INFO: eta: 99 days, 6:11:35 iteration: 3/335008 consumed_samples: 16 total_loss: 9.374 time: 25.5994 s/iter data_time: 0.0009 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 -[03/30 11:31:01] lb.utils.events INFO: eta: 99 days, 21:40:02 iteration: 4/335008 consumed_samples: 20 total_loss: 9.307 time: 25.6548 s/iter data_time: 0.0010 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 -[03/30 11:31:27] lb.utils.events INFO: eta: 99 days, 18:03:38 iteration: 5/335008 consumed_samples: 24 total_loss: 9.212 time: 25.6632 s/iter data_time: 0.0009 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 -[03/30 11:31:53] lb.utils.events INFO: eta: 99 days, 18:24:32 iteration: 6/335008 consumed_samples: 28 total_loss: 9.117 time: 25.6768 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 -[03/30 11:32:18] lb.utils.events INFO: eta: 99 days, 20:01:26 iteration: 7/335008 consumed_samples: 32 total_loss: 9.093 time: 25.7183 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 -[03/30 11:32:44] lb.utils.events INFO: eta: 99 days, 18:23:41 iteration: 8/335008 consumed_samples: 36 total_loss: 9.069 time: 25.7046 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 -[03/30 11:33:10] lb.utils.events INFO: eta: 99 days, 16:24:36 iteration: 9/335008 consumed_samples: 40 total_loss: 8.982 time: 25.6782 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 -[03/30 11:33:35] lb.utils.events INFO: eta: 99 days, 14:25:31 iteration: 10/335008 consumed_samples: 44 total_loss: 8.896 time: 25.6656 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 
5.00e-05 -[03/30 11:34:01] lb.utils.events INFO: eta: 99 days, 16:23:44 iteration: 11/335008 consumed_samples: 48 total_loss: 8.853 time: 25.6755 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 -[03/30 11:34:27] lb.utils.events INFO: eta: 99 days, 17:39:35 iteration: 12/335008 consumed_samples: 52 total_loss: 8.811 time: 25.6799 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 -[03/30 11:34:53] lb.utils.events INFO: eta: 99 days, 18:00:21 iteration: 13/335008 consumed_samples: 56 total_loss: 8.789 time: 25.6990 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 -[03/30 11:35:18] lb.utils.events INFO: eta: 99 days, 18:21:06 iteration: 14/335008 consumed_samples: 60 total_loss: 8.768 time: 25.7027 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 -[03/30 11:35:44] lb.utils.events INFO: eta: 99 days, 17:59:29 iteration: 15/335008 consumed_samples: 64 total_loss: 8.736 time: 25.7000 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 -[03/30 11:36:10] lb.utils.events INFO: eta: 99 days, 18:20:15 iteration: 16/335008 consumed_samples: 68 total_loss: 8.704 time: 25.7059 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 -[03/30 11:36:36] lb.utils.events INFO: eta: 99 days, 19:02:25 iteration: 17/335008 consumed_samples: 72 total_loss: 8.693 time: 25.7103 s/iter data_time: 0.0008 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05 -[03/30 11:37:47] libai INFO: Rank of current process: 0. World size: 1 -[03/30 11:37:47] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[03/30 11:37:47] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - -graph.enabled=True - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = False -model.cfg.bias_dropout_fusion = False -model.cfg.scale_mask_softmax_fusion = False -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = False - -train.input_placement_device = "cpu" - 
-train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 -optim.params.clip_grad_max_norm = None -optim.params.clip_grad_norm_type = None - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=1, - amp=dict(enabled=False), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=1, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers=model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/30 11:37:47] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml -[03/30 11:37:47] lb.engine.default INFO: > compiling dataset index builder ... -[03/30 11:37:47] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.053 seconds -[03/30 11:37:47] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.054 seconds -[03/30 11:37:49] lb.engine.default INFO: Prepare training, validating, testing set -[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: building dataset index ... -[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... -[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: reading sizes... -[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: reading pointers... -[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: reading document index... -[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... -[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... -[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... 
-[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007539 seconds -[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: -[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 -[03/30 11:37:49] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/30 11:37:49] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/30 11:37:50] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 -[03/30 11:37:50] libai INFO: > Start building model... 
-[03/30 11:37:50] lb.engine.default INFO: Model: -GPTForPreTraining( - (GPT_model): GPTModel( - (embeddings): GPTEmbedding( - (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) - (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) - (dropout): Dropout(p=0.1, inplace=False) - ) - (transformer): Transformer( - (layers): ModuleList( - (0): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (1): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (2): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (3): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): 
Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (4): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (5): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (6): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (7): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, 
inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (8): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (9): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (10): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (11): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, 
num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - ) - (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - ) - (lm_head): LMLogits() - ) - (loss_func): GPTLoss( - (lm_loss): ParallelCrossEntropyLoss() - ) -)
-[03/30 11:37:50] libai INFO: >>> done with building model. Building time: 0.155 seconds
-[03/30 11:37:50] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/30 11:37:51] lb.engine.trainer INFO: Starting training from iteration 0
-[03/30 11:37:51] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/30 11:37:56] lb.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last):
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
-    self.run_step()
-  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
-    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
-    loss_dict = self.graph(**data)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
-    self._compile(*args, **kwargs)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
-    self.finish_compile_and_init_runtime()
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime
-    self._c_nn_graph.init_runtime()
-oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
-
-[03/30 11:37:56] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks)
-[03/30 11:49:16] libai INFO: Rank of current process: 0.
World size: 1 -[03/30 11:49:16] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[03/30 11:49:16] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - -graph.enabled=True -graph.debug=0 - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = False -model.cfg.bias_dropout_fusion = False -model.cfg.scale_mask_softmax_fusion = False -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = False - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 -optim.params.clip_grad_max_norm = None -optim.params.clip_grad_norm_type = None - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=1, - amp=dict(enabled=False), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=1, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers=model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/30 11:49:16] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml -[03/30 11:49:16] lb.engine.default INFO: > compiling dataset index builder ... -[03/30 11:49:16] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.045 seconds -[03/30 11:49:16] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.046 seconds -[03/30 11:49:18] lb.engine.default INFO: Prepare training, validating, testing set -[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: building dataset index ... 
-[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: reading sizes...
-[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: reading pointers...
-[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: reading document index...
-[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
-[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
-[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
-[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008186 seconds
-[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
-[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
-[03/30 11:49:18] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/30 11:49:18] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/30 11:49:19] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/30 11:49:19] libai INFO: > Start building model...
-[03/30 11:49:19] lb.engine.default INFO: Model:
-GPTForPreTraining(
-  (GPT_model): GPTModel(
-    (embeddings): GPTEmbedding(
-      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
-      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
-      (dropout): Dropout(p=0.1, inplace=False)
-    )
-    (transformer): Transformer(
-      (layers): ModuleList(
-        (0): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (1)-(11): eleven further TransformerLayer blocks, identical to (0)
-      )
-      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-    )
-    (lm_head): LMLogits()
-  )
-  (loss_func): GPTLoss(
-    (lm_loss): ParallelCrossEntropyLoss()
-  )
-)
-[03/30 11:49:19] libai INFO: >>> done with building model. Building time: 0.157 seconds
-[03/30 11:49:19] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/30 11:49:19] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 11:49:19] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 11:49:20] lb.engine.trainer INFO: Starting training from iteration 0
-[03/30 11:49:20] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/30 11:49:25] lb.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last): (frames identical to the traceback above)
-oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
-
-[03/30 11:49:25] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks)
-[03/30 11:49:48] libai INFO: Rank of current process: 0. World size: 1
-[03/30 11:49:48] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/30 11:49:48] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: (identical to the dump above, except graph.debug = 1)
-[03/30 11:49:48] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/30 11:49:48] lb.engine.default INFO: > compiling dataset index builder ...
-[03/30 11:49:48] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.044 seconds
-[03/30 11:49:48] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.045 seconds
-[03/30 11:49:50] lb.engine.default INFO: Prepare training, validating, testing set
-[03/30 11:49:50] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/30 11:49:50] lb.data.data_utils.indexed_dataset INFO: (index build and index-map loading identical to the sequence above: 73718 documents, 106365 sentences; 40608 / 4512 / 4512 samples)
-[03/30 11:49:50] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/30 11:49:50] libai INFO: > Start building model...
-[03/30 11:49:50] lb.engine.default INFO: Model: GPTForPreTraining( ... identical to the module dump above ... )
-[03/30 11:49:50] libai INFO: >>> done with building model. Building time: 0.154 seconds
-[03/30 11:49:50] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/30 11:49:50] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 11:49:50] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 11:49:51] lb.engine.trainer INFO: Starting training from iteration 0
-[03/30 11:49:51] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/30 11:49:56] lb.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last): (frames identical to the traceback above)
-oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
-
-[03/30 11:49:56] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks)
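Since the error is raised from the allocator rather than the compiler, it is worth sanity-checking how much memory the model in the config above needs at all. A back-of-envelope count from model.cfg (hidden 768, 12 layers, padded vocab 50304, seq 1024); the snippet below is purely illustrative, not LiBai code:

    # Parameter count mirroring model.cfg above.
    hidden, layers, vocab, seq = 768, 12, 50304, 1024
    embed = vocab * hidden + seq * hidden              # token + position embeddings
    qkv = hidden * 3 * hidden + 3 * hidden             # query_key_value
    attn_out = hidden * hidden + hidden                # attention dense
    mlp = (hidden * 4 * hidden + 4 * hidden) + (4 * hidden * hidden + hidden)
    norms = 4 * hidden                                 # two LayerNorms per layer
    total = embed + layers * (qkv + attn_out + mlp + norms) + 2 * hidden
    print(f"{total / 1e6:.0f}M params, {total * 4 / 2**30:.2f} GiB fp32")
    # ~124M params, ~0.46 GiB of fp32 weights; roughly 1.4 GiB once Adam's
    # m/v states are added, before gradients and activations for batch 4 x 1024.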
-[03/30 15:40:54] libai INFO: Rank of current process: 0. World size: 1
-[03/30 15:40:54] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/30 15:40:54] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: (identical to the previous run: graph.enabled = True, graph.debug = 1)
-[03/30 15:40:54] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/30 15:40:54] lb.engine.default INFO: > compiling dataset index builder ...
-[03/30 15:40:54] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.050 seconds
-[03/30 15:40:54] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.051 seconds
-[03/30 15:40:56] lb.engine.default INFO: Prepare training, validating, testing set
-[03/30 15:40:56] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/30 15:40:56] lb.data.data_utils.indexed_dataset INFO: (index build and index-map loading identical to the sequence above: 73718 documents, 106365 sentences; 40608 / 4512 / 4512 samples)
-[03/30 15:40:56] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/30 15:40:56] libai INFO: > Start building model...
-[03/30 15:40:56] lb.engine.default INFO: Model: GPTForPreTraining( ... identical to the module dump above ... )
-[03/30 15:40:56] libai INFO: >>> done with building model. Building time: 0.169 seconds
-[03/30 15:40:56] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/30 15:40:56] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 15:40:56] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 15:40:58] lb.engine.trainer INFO: Starting training from iteration 0
-[03/30 15:40:58] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/30 15:41:03] lb.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last):
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
-    self.run_step()
-  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
-    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
-    loss_dict = self.graph(**data)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
-    self._compile(*args, **kwargs)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
-    self.finish_compile_and_init_runtime()
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime
-    self._c_nn_graph.init_runtime()
-oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate
-    device->Alloc(options, &ptr, size)
-Error Type: oneflow.ErrorProto.runtime_error
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator()
-
-Error Type: oneflow.ErrorProto.runtime_error
-
-[03/30 15:41:03] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks)
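With the allocator frames now visible (memory_allocator.cpp calling device->Alloc), the run below retries with nn.Graph disabled, so execution stays eager and never reaches init_runtime(), where every failure so far has been raised. The only config change relative to the attempts above:

    graph.enabled = False  # run eagerly; skip nn.Graph compilation and init_runtime()
    graph.debug = 1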
World size: 1 -[03/30 16:06:54] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[03/30 16:06:54] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - -graph.enabled=False -graph.debug=1 - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = False -model.cfg.bias_dropout_fusion = False -model.cfg.scale_mask_softmax_fusion = False -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = False - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 -optim.params.clip_grad_max_norm = None -optim.params.clip_grad_norm_type = None - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=1, - amp=dict(enabled=False), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=1, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers=model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/30 16:06:54] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml -[03/30 16:06:54] lb.engine.default INFO: > compiling dataset index builder ... -[03/30 16:06:54] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.052 seconds -[03/30 16:06:54] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.053 seconds -[03/30 16:06:56] lb.engine.default INFO: Prepare training, validating, testing set -[03/30 16:06:56] lb.data.data_utils.indexed_dataset INFO: building dataset index ... 
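Note: the MluDevice::Alloc failure above is raised while nn.Graph compiles the training graph, before a single iteration runs. The retry whose config is shown above keeps train_micro_batch_size=4 but sets graph.enabled=False, so the model runs eagerly and skips graph compilation entirely; the 16:12 run further below flips the trade the other way. A minimal sketch of the two knobs this log varies between runs, in the same LazyConfig style as the config above:

    # Eager retry (this run): skip nn.Graph compilation, keep micro batch 4.
    graph.enabled = False
    train.update(dict(train_micro_batch_size=4, test_micro_batch_size=4))

    # Graph-mode run (16:12 below): re-enable nn.Graph but shrink the micro
    # batch to 1, presumably to reduce MLU memory pressure during compilation.
    # graph.enabled = True
    # train.update(dict(train_micro_batch_size=1, test_micro_batch_size=1))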
-[03/30 16:06:56] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/30 16:06:56] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/30 16:06:56] lb.data.data_utils.indexed_dataset INFO: reading sizes...
-[03/30 16:06:56] lb.data.data_utils.indexed_dataset INFO: reading pointers...
-[03/30 16:06:56] lb.data.data_utils.indexed_dataset INFO: reading document index...
-[03/30 16:06:56] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
-[03/30 16:06:56] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
-[03/30 16:06:56] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
-[03/30 16:06:56] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.006865 seconds
-[03/30 16:06:56] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
-[03/30 16:06:56] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
-[03/30 16:06:56] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.005 seconds
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/30 16:06:56] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/30 16:06:56] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/30 16:06:56] libai INFO: > Start building model...
-[03/30 16:06:56] lb.engine.default INFO: Model:
-GPTForPreTraining(
-  (GPT_model): GPTModel(
-    (embeddings): GPTEmbedding(
-      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
-      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
-      (dropout): Dropout(p=0.1, inplace=False)
-    )
-    (transformer): Transformer(
-      (layers): ModuleList(
-        (0): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (1)-(11): TransformerLayer( ...identical to (0); omitted... )
-      )
-      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-    )
-    (lm_head): LMLogits()
-  )
-  (loss_func): GPTLoss(
-    (lm_loss): ParallelCrossEntropyLoss()
-  )
-)
-[03/30 16:06:56] libai INFO: >>> done with building model. Building time: 0.135 seconds
-[03/30 16:06:56] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/30 16:06:57] lb.engine.trainer INFO: Starting training from iteration 0
-[03/30 16:07:23] lb.utils.events INFO: iteration: 0/335008 consumed_samples: 4 total_loss: 10.86 data_time: 0.0028 s/iter lr: 5.00e-05
-[03/30 16:07:48] lb.utils.events INFO: eta: 99 days, 12:07:57 iteration: 1/335008 consumed_samples: 8 total_loss: 10.08 data_time: 0.0016 s/iter lr: 5.00e-05
-[03/30 16:08:14] lb.utils.events INFO: eta: 99 days, 18:10:44 iteration: 2/335008 consumed_samples: 12 total_loss: 9.44 time: 25.7281 s/iter data_time: 0.0013 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 16:10:06] libai INFO: Rank of current process: 0. World size: 1
-[03/30 16:10:06] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/30 16:10:06] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
-( ...identical to the 16:06 run above: graph.enabled=False, train_micro_batch_size=4; omitted... )
-
-[03/30 16:10:06] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/30 16:10:06] lb.engine.default INFO: > compiling dataset index builder ...
-[03/30 16:10:06] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.052 seconds
-[03/30 16:10:06] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.053 seconds
-[03/30 16:10:08] lb.engine.default INFO: Prepare training, validating, testing set
-[03/30 16:10:08] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-( ...indexed-dataset and index-map loading logs identical to the 16:06 run above, same document, sentence, and sample counts; omitted... )
-[03/30 16:10:08] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/30 16:10:08] libai INFO: > Start building model...
-[03/30 16:10:08] lb.engine.default INFO: Model:
-GPTForPreTraining( ...model structure identical to the run above; omitted... )
-[03/30 16:10:08] libai INFO: >>> done with building model. Building time: 0.156 seconds
-[03/30 16:10:08] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/30 16:10:09] lb.engine.trainer INFO: Starting training from iteration 0
-[03/30 16:10:35] lb.utils.events INFO: iteration: 0/335008 consumed_samples: 4 total_loss: 10.86 data_time: 0.0018 s/iter lr: 5.00e-05
-[03/30 16:11:01] lb.utils.events INFO: eta: 99 days, 11:08:04 iteration: 1/335008 consumed_samples: 8 total_loss: 10.08 data_time: 0.0014 s/iter lr: 5.00e-05
-[03/30 16:11:26] lb.utils.events INFO: eta: 99 days, 13:30:23 iteration: 2/335008 consumed_samples: 12 total_loss: 9.44 time: 25.6779 s/iter data_time: 0.0012 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 16:11:52] lb.utils.events INFO: eta: 99 days, 5:53:49 iteration: 3/335008 consumed_samples: 16 total_loss: 9.374 time: 25.5962 s/iter data_time: 0.0010 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 16:12:17] libai INFO: Rank of current process: 0. World size: 1
-[03/30 16:12:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/30 16:12:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
-( ...identical to the runs above except graph.enabled=True and train_micro_batch_size=test_micro_batch_size=1; omitted... )
-
-[03/30 16:12:17] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/30 16:12:17] lb.engine.default INFO: > compiling dataset index builder ...
-[03/30 16:12:18] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.051 seconds
-[03/30 16:12:18] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.052 seconds
-[03/30 16:12:19] lb.engine.default INFO: Prepare training, validating, testing set
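Note: both eager runs above settle at roughly 25.7 s/iter (0.16 samples/s) on a single MLU, and the logged eta follows directly from the smoothed iteration time. A quick sanity check in plain Python (the helper is illustrative, not a LiBai API):

    # eta ~= remaining iterations x smoothed seconds per iteration
    def eta_days(total_iter, done_iter, sec_per_iter):
        return (total_iter - done_iter) * sec_per_iter / 86400.0

    print(eta_days(335008, 2, 25.7281))  # ~99.76 days, i.e. "99 days, 18:10:44" as logged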
-[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: reading sizes...
-[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: reading pointers...
-[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: reading document index...
-[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
-[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
-[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
-[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007111 seconds
-[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
-[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
-[03/30 16:12:19] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
-[03/30 16:12:19] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ...
-[03/30 16:12:19] lb.data.datasets.gpt_dataset INFO: > last epoch number of samples (977) is smaller than 80% of number of samples per epoch (4511), setting separate_last_epoch to True
-[03/30 16:12:19] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ...
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.010740
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ...
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.001255
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 9023) and [9023, 13535) ...
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000525
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_doc_idx.npy
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_sample_idx.npy
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_shuffle_idx.npy
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: total number of samples: 13536
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: total number of epochs: 3
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ...
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > only one epoch required, setting separate_last_epoch to False
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ...
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.003128
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ...
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.000350
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 4511) and [4511, 4511) ...
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000232
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_doc_idx.npy
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_sample_idx.npy
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_shuffle_idx.npy
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ...
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > only one epoch required, setting separate_last_epoch to False
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ...
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.003130
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ...
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.000328
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 4511) and [4511, 4511) ...
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000229
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_doc_idx.npy
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_sample_idx.npy
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_shuffle_idx.npy
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/30 16:12:20] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/30 16:12:20] lb.engine.default INFO: Auto-scaling the config to train.train_iter=446655, train.warmup_iter=0
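Note: two numbers in the indexing pass above can be reproduced by hand. The rules below are inferred from the log messages themselves, not quoted from gpt_dataset.py, so treat them as a sketch:

    # 1) separate_last_epoch: the last epoch (977 samples) covers less than
    #    80% of a full epoch (4511 samples), so its samples are kept separate.
    last_epoch_samples, samples_per_epoch = 977, 4511
    print(last_epoch_samples < 0.80 * samples_per_epoch)  # True, as logged

    # 2) Auto-scaled train_iter: 33 epochs over the 13535-sample shuffle range
    #    at a global batch size of 1 is consistent with the logged value.
    print(33 * 13535)  # 446655, matching train.train_iter above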
-[03/30 16:12:20] libai INFO: > Start building model...
-[03/30 16:12:20] lb.engine.default INFO: Model:
-GPTForPreTraining( ...model structure identical to the runs above; omitted... )
-[03/30 16:12:20] libai INFO: >>> done with building model. Building time: 0.153 seconds
-[03/30 16:12:20] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/30 16:12:20] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 16:12:20] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 16:12:21] lb.engine.trainer INFO: Starting training from iteration 0
-[03/30 16:12:21] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/30 17:28:10] libai INFO: Rank of current process: 0. World size: 1
-[03/30 17:28:10] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/30 17:28:10] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
-( ...identical to the 16:12 run above: graph.enabled=True, train_micro_batch_size=1; omitted... )
-
-[03/30 17:28:10] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/30 17:28:10] lb.engine.default INFO: > compiling dataset index builder ...
-[03/30 17:28:10] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.094 seconds
-[03/30 17:28:10] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.095 seconds
-[03/30 17:28:12] lb.engine.default INFO: Prepare training, validating, testing set
-[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: reading sizes...
-[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: reading pointers...
-[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: reading document index...
-[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
-[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
-[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
-[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007207 seconds
-[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
-[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
-[03/30 17:28:12] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_doc_idx.npy
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_sample_idx.npy
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_shuffle_idx.npy
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: total number of samples: 13536
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: total number of epochs: 3
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_doc_idx.npy
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_sample_idx.npy
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_shuffle_idx.npy
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_doc_idx.npy
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_sample_idx.npy
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_shuffle_idx.npy
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/30 17:28:12] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/30 17:28:12] lb.engine.default INFO: Auto-scaling the config to train.train_iter=446655, train.warmup_iter=0
-[03/30 17:28:12] libai INFO: > Start building model...
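The doc/sample/shuffle index-map paths loaded above follow a fixed naming scheme that encodes the sampling parameters, which is why a rerun with the same settings can reuse the cached .npy files instead of rebuilding them. A minimal sketch of that scheme, inferred purely from the filenames in this log (not taken from LibAI's source):

    # Sketch: reconstruct the cached index-map filenames seen in the log above.
    # Inferred pattern: <data_prefix>_<name>_indexmap_<N>ns_<L>sl_<S>s_<kind>_idx.npy
    data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"

    def indexmap_path(num_samples: int, seq_len: int, seed: int, kind: str) -> str:
        # kind is one of "doc", "sample", "shuffle"
        return f"{data_prefix}_gpt-2_indexmap_{num_samples}ns_{seq_len}sl_{seed}s_{kind}_idx.npy"

    # Matches the training split above: 10000 samples, 1024 seq length, seed 1234.
    print(indexmap_path(10000, 1024, 1234, "doc"))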
-[03/30 17:28:12] lb.engine.default INFO: Model:
-GPTForPreTraining(
-  (GPT_model): GPTModel(
-    (embeddings): GPTEmbedding(
-      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
-      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
-      (dropout): Dropout(p=0.1, inplace=False)
-    )
-    (transformer): Transformer(
-      (layers): ModuleList(
-        (0): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        ... (layers (1) through (11) repeat the same TransformerLayer structure and are elided) ...
-      )
-      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-    )
-    (lm_head): LMLogits()
-  )
-  (loss_func): GPTLoss(
-    (lm_loss): ParallelCrossEntropyLoss()
-  )
-)
-[03/30 17:28:12] libai INFO: >>> done with building model. Building time: 0.153 seconds
-[03/30 17:28:12] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/30 17:28:12] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 17:28:12] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 17:28:14] lb.engine.trainer INFO: Starting training from iteration 0
-[03/30 17:28:14] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/30 17:28:21] lb.utils.events INFO: iteration: 0/446655 consumed_samples: 1 total_loss: 11.33 data_time: 0.0009 s/iter lr: 5.00e-05
-[03/30 17:28:27] lb.utils.events INFO: eta: 33 days, 6:54:54 iteration: 1/446655 consumed_samples: 2 total_loss: 11.34 data_time: 0.0016 s/iter lr: 5.00e-05
-[03/30 17:28:34] lb.utils.events INFO: eta: 33 days, 3:04:22 iteration: 2/446655 consumed_samples: 3 total_loss: 11.33 time: 6.4083 s/iter data_time: 0.0029 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:28:40] lb.utils.events INFO: eta: 33 days, 3:14:30 iteration: 3/446655 consumed_samples: 4 total_loss: 11.34 time: 6.4096 s/iter data_time: 0.0024 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:28:46] lb.utils.events INFO: eta: 33 days, 3:19:03 iteration: 4/446655 consumed_samples: 5 total_loss: 11.36 time: 6.4098 s/iter data_time: 0.0021 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:28:53] lb.utils.events INFO: eta: 33 days, 3:20:56 iteration: 5/446655 consumed_samples: 6 total_loss: 11.35 time: 6.4101 s/iter data_time: 0.0020 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:28:59] lb.utils.events INFO: eta: 33 days, 3:18:50 iteration: 6/446655 consumed_samples: 7 total_loss: 11.34 time: 6.4101 s/iter data_time: 0.0020 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:29:06] lb.utils.events INFO: eta: 33 days, 3:17:35 iteration: 7/446655 consumed_samples: 8 total_loss: 11.34 time: 6.4013 s/iter data_time: 0.0018 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:32:57] libai INFO: Rank of current process: 0. World size: 1
-[03/30 17:32:57] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/30 17:32:57] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
- ... (identical to the config dump above, except graph.debug=1 is dropped and train_micro_batch_size=2; elided) ...
-[03/30 17:32:57] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/30 17:32:57] lb.engine.default INFO: > compiling dataset index builder ...
-[03/30 17:32:57] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.056 seconds
-[03/30 17:32:57] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.057 seconds
-[03/30 17:32:59] lb.engine.default INFO: Prepare training, validating, testing set
-[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: reading sizes...
-[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: reading pointers...
-[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: reading document index...
-[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
-[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
-[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
-[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007043 seconds
-[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
-[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
-[03/30 17:32:59] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
-[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ...
-[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > last epoch number of samples (1953) is smaller than 80% of number of samples per epoch (4511), setting separate_last_epoch to True
-[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ...
-[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.019319
-[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ...
-[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.001643
-[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 18047) and [18047, 22559) ...
-[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000756
-[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_20000ns_1024sl_1234s_doc_idx.npy
-[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_20000ns_1024sl_1234s_sample_idx.npy
-[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_20000ns_1024sl_1234s_shuffle_idx.npy
-[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.005 seconds
-[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: total number of samples: 22560
-[03/30 17:32:59] lb.data.datasets.gpt_dataset INFO: total number of epochs: 5
- ... (750ns validation and 250ns test index maps loaded as in the first run: 4512 samples, 1 epoch each; elided) ...
-[03/30 17:32:59] lb.engine.default INFO: Auto-scaling the config to train.train_iter=372224, train.warmup_iter=0
-[03/30 17:32:59] libai INFO: > Start building model...
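Note the index maps change from 10000ns in the first run to 20000ns here: the "ns" figure is consistent with the requested sample count being train.train_iter (10000 in the config) times train_micro_batch_size, padded up to whole epochs of 4512 samples. This is a back-of-the-envelope consistency check inferred from the two logs, not LibAI's actual sizing code:

    import math

    # Assumption: mapped samples = train_iter * micro_batch, rounded up to full epochs.
    samples_per_epoch = 4512  # from the "total number of samples / epochs" lines above

    def mapped_samples(train_iter: int, micro_batch: int) -> tuple:
        requested = train_iter * micro_batch  # the "<N>ns" in the filename
        epochs = math.ceil(requested / samples_per_epoch)
        return requested, epochs * samples_per_epoch

    print(mapped_samples(10000, 1))  # (10000, 13536): 3 epochs, as in the first run
    print(mapped_samples(10000, 2))  # (20000, 22560): 5 epochs, as in this run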
-[03/30 17:32:59] lb.engine.default INFO: Model:
-GPTForPreTraining( ... same architecture dump as above (12 identical TransformerLayers), elided ... )
-[03/30 17:32:59] libai INFO: >>> done with building model. Building time: 0.156 seconds
-[03/30 17:32:59] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/30 17:33:00] lb.engine.trainer INFO: Starting training from iteration 0
-[03/30 17:33:00] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/30 17:33:06] lb.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last):
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
-    self.run_step()
-  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
-    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
-    loss_dict = self.graph(**data)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
-    self._compile(*args, **kwargs)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
-    self.finish_compile_and_init_runtime()
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime
-    self._c_nn_graph.init_runtime()
-oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate
-    device->Alloc(options, &ptr, size)
-Error Type: oneflow.ErrorProto.runtime_error
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator()
-
-Error Type: oneflow.ErrorProto.runtime_error
-
-[03/30 17:33:06] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks)
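The run above fails with MluDevice::Alloc error while initializing the compiled train graph, i.e. the MLU runs out of device memory for the buffers needed at train_micro_batch_size=2. The next run (below) succeeds after dropping the micro batch size back to 1; a minimal sketch of the relevant override, in the same train.update style as the config above:

    # Reduce the per-device memory footprint; this is the only setting that
    # differs between the failing run above and the succeeding run below.
    train.update(
        dict(
            train_micro_batch_size=1,  # was 2, which exceeded MLU device memory
            test_micro_batch_size=1,
        )
    )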
World size: 1 -[03/30 17:33:37] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[03/30 17:33:37] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - -graph.enabled=True - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = False -model.cfg.bias_dropout_fusion = False -model.cfg.scale_mask_softmax_fusion = False -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = False - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 -optim.params.clip_grad_max_norm = None -optim.params.clip_grad_norm_type = None - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=1, - test_micro_batch_size=1, - train_epoch=33, - train_iter=10000, - log_period=1, - amp=dict(enabled=False), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=1, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers=model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/30 17:33:37] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml -[03/30 17:33:37] lb.engine.default INFO: > compiling dataset index builder ... -[03/30 17:33:37] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.080 seconds -[03/30 17:33:37] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.080 seconds -[03/30 17:33:39] lb.engine.default INFO: Prepare training, validating, testing set -[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: building dataset index ... 
-[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... -[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: reading sizes... -[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: reading pointers... -[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: reading document index... -[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... -[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... -[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... -[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.006754 seconds -[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: -[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 -[03/30 17:33:39] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_doc_idx.npy -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_sample_idx.npy -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_10000ns_1024sl_1234s_shuffle_idx.npy -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: total number of samples: 13536 -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: total number of epochs: 3 -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_doc_idx.npy -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_sample_idx.npy -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_750ns_1024sl_1234s_shuffle_idx.npy -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_doc_idx.npy -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_sample_idx.npy -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_250ns_1024sl_1234s_shuffle_idx.npy -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/30 17:33:39] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/30 17:33:39] 
lb.engine.default INFO: Auto-scaling the config to train.train_iter=446655, train.warmup_iter=0
-[03/30 17:33:39] libai INFO: > Start building model...
-[03/30 17:33:39] lb.engine.default INFO: Model:
-GPTForPreTraining(
-  (GPT_model): GPTModel(
-    (embeddings): GPTEmbedding(
-      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
-      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
-      (dropout): Dropout(p=0.1, inplace=False)
-    )
-    (transformer): Transformer(
-      (layers): ModuleList(
-        (0): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (1): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (2): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (3): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (4): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (5): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (6): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (7): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (8): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (9): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (10): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (11): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-      )
-      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-    )
-    (lm_head): LMLogits()
-  )
-  (loss_func): GPTLoss(
-    (lm_loss): ParallelCrossEntropyLoss()
-  )
-)
-[03/30 17:33:39] libai INFO: >>> done with building model. Building time: 0.161 seconds
-[03/30 17:33:39] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/30 17:33:40] lb.engine.trainer INFO: Starting training from iteration 0
-[03/30 17:33:40] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/30 17:33:47] lb.utils.events INFO: iteration: 0/446655 consumed_samples: 1 total_loss: 11.33 data_time: 0.0012 s/iter lr: 5.00e-05
-[03/30 17:33:53] lb.utils.events INFO: eta: 33 days, 2:28:50 iteration: 1/446655 consumed_samples: 2 total_loss: 11.34 data_time: 0.0011 s/iter lr: 5.00e-05
-[03/30 17:33:59] lb.utils.events INFO: eta: 33 days, 1:36:38 iteration: 2/446655 consumed_samples: 3 total_loss: 11.33 time: 6.3965 s/iter data_time: 0.0011 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:34:06] lb.utils.events INFO: eta: 33 days, 1:55:18 iteration: 3/446655 consumed_samples: 4 total_loss: 11.34 time: 6.3990 s/iter data_time: 0.0012 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:34:12] lb.utils.events INFO: eta: 33 days, 2:13:58 iteration: 4/446655 consumed_samples: 5 total_loss: 11.36 time: 6.4007 s/iter data_time: 0.0013 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:34:19] lb.utils.events INFO: eta: 33 days, 1:55:05 iteration: 5/446655 consumed_samples: 6 total_loss: 11.35 time: 6.3991 s/iter data_time: 0.0013 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:34:25] lb.utils.events INFO: eta: 33 days, 2:13:45 iteration: 6/446655 consumed_samples: 7 total_loss: 11.34 time: 6.4000 s/iter data_time: 0.0012 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:34:31] lb.utils.events INFO: eta: 33 days, 2:07:00 iteration: 7/446655 consumed_samples: 8 total_loss: 11.34 time: 6.4000 s/iter data_time: 0.0012 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:34:38] lb.utils.events INFO: eta: 33 days, 2:00:16 iteration: 8/446655 consumed_samples: 9 total_loss: 11.34 time: 6.3995 s/iter data_time: 0.0014 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:34:44] lb.utils.events INFO: eta: 33 days, 1:55:06 iteration: 9/446655 consumed_samples: 10 total_loss: 11.34 time: 6.3994 s/iter data_time: 0.0015 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:34:51] lb.utils.events INFO: eta: 33 days, 1:49:55 iteration: 10/446655 consumed_samples: 11 total_loss: 11.34 time: 6.3990 s/iter data_time: 0.0015 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:34:57] lb.utils.events INFO: eta: 33 days, 1:54:06 iteration: 11/446655 consumed_samples: 12 total_loss: 11.34 time: 6.3990 s/iter data_time: 0.0015 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:35:03] lb.utils.events INFO: eta: 33 days, 1:49:42 iteration: 12/446655 consumed_samples: 13 total_loss: 11.33 time: 6.3988 s/iter data_time: 0.0017 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:35:10] lb.utils.events INFO: eta: 33 days, 1:53:53 iteration: 13/446655 consumed_samples: 14 total_loss: 11.33 time: 6.3991 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:35:16] lb.utils.events INFO: eta: 33 days, 1:55:44 iteration: 14/446655 consumed_samples: 15 total_loss: 11.33 time: 6.3991 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:35:23] lb.utils.events INFO: eta: 33 days, 1:52:31 iteration: 15/446655 consumed_samples: 16 total_loss: 11.33 time: 6.3990 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:35:29] lb.utils.events INFO: eta: 33 days, 1:55:32 iteration: 16/446655 consumed_samples: 17 total_loss: 11.33 time: 6.3992 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:35:35] lb.utils.events INFO: eta: 33 days, 1:52:18 iteration: 17/446655 consumed_samples: 18 total_loss: 11.33 time: 6.3990 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:35:42] lb.utils.events INFO: eta: 33 days, 1:55:19 iteration: 18/446655 consumed_samples: 19 total_loss: 11.33 time: 6.3992 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:35:48] lb.utils.events INFO: eta: 33 days, 1:56:22 iteration: 19/446655 consumed_samples: 20 total_loss: 11.33 time: 6.3993 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:35:55] lb.utils.events INFO: eta: 33 days, 1:55:06 iteration: 20/446655 consumed_samples: 21 total_loss: 11.33 time: 6.3993 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:36:01] lb.utils.events INFO: eta: 33 days, 1:55:02 iteration: 21/446655 consumed_samples: 22 total_loss: 11.33 time: 6.3993 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:36:07] lb.utils.events INFO: eta: 33 days, 1:54:59 iteration: 22/446655 consumed_samples: 23 total_loss: 11.33 time: 6.3993 s/iter data_time: 0.0017 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:36:14] lb.utils.events INFO: eta: 33 days, 1:54:56 iteration: 23/446655 consumed_samples: 24 total_loss: 11.33 time: 6.3993 s/iter data_time: 0.0017 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:36:20] lb.utils.events INFO: eta: 33 days, 1:54:46 iteration: 24/446655 consumed_samples: 25 total_loss: 11.33 time: 6.3992 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:36:27] lb.utils.events INFO: eta: 33 days, 1:54:37 iteration: 25/446655 consumed_samples: 26 total_loss: 11.33 time: 6.3990 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:36:33] lb.utils.events INFO: eta: 33 days, 1:54:28 iteration: 26/446655 consumed_samples: 27 total_loss: 11.33 time: 6.3989 s/iter data_time: 0.0017 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:36:39] lb.utils.events INFO: eta: 33 days, 1:53:04 iteration: 27/446655 consumed_samples: 28 total_loss: 11.33 time: 6.3989 s/iter data_time: 0.0017 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:36:46] lb.utils.events INFO: eta: 33 days, 1:51:41 iteration: 28/446655 consumed_samples: 29 total_loss: 11.33 time: 6.3988 s/iter data_time: 0.0017 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
-[03/30 17:36:52] lb.utils.events INFO: eta: 33 days, 1:50:08 iteration: 29/446655 consumed_samples: 30 total_loss: 11.33 time: 6.3987 s/iter data_time: 0.0016 s/iter total_throughput: 0.16 samples/s lr: 5.00e-05
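(Editor's note: a quick sanity check of the reported eta, using only the numbers printed in the log above; a back-of-the-envelope sketch, not part of the original log.)

    # Remaining work x steady-state step time, from the log lines above.
    iters_remaining = 446655   # from "iteration: .../446655"
    sec_per_iter = 6.3990      # typical "time: ... s/iter" once warmed up
    days = iters_remaining * sec_per_iter / 86400
    print(f"{days:.1f} days")  # ~33.1 days, consistent with "eta: 33 days"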
-[03/30 18:18:17] libai INFO: Rank of current process: 0. World size: 4
-[03/30 18:18:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/30 18:18:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
-from libai.config import LazyCall
-from libai.evaluation import PPLEvaluator
-from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
-from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
-from configs.common.optim import optim
-
-from libai.scheduler import WarmupExponentialLR
-
-from configs.common.train import train
-from configs.common.models.graph import graph
-
-graph.enabled=True
-graph.debug = 2
-
-vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
-merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
-train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
-
-tokenization.tokenizer.vocab_file = vocab_file
-tokenization.tokenizer.merges_file = merge_files
-dataloader.train.dataset[0].data_prefix = train_data_prefix
-dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
-
-# gpt2 model config
-model.cfg.pretrained_model_path = None
-model.cfg.embedding_dropout_prob = 0.1
-model.cfg.attention_dropout_prob = 0.1
-model.cfg.output_dropout_prob = 0.1
-model.cfg.num_attention_heads = 12
-model.cfg.hidden_size = 768
-model.cfg.ffn_hidden_size = 4 * 768
-model.cfg.hidden_layers = 12
-model.cfg.max_seq_length = 1024
-model.cfg.initializer_range = 0.02
-model.cfg.vocab_size = 50257
-model.cfg.layernorm_epsilon = 1e-5
-model.cfg.use_scaled_init_for_output_weights = True
-model.cfg.bias_gelu_fusion = False
-model.cfg.bias_dropout_fusion = False
-model.cfg.scale_mask_softmax_fusion = False
-model.cfg.apply_query_key_layer_scaling = True
-model.cfg.apply_residual_post_layernorm = False
-model.cfg.amp_enabled = False
-
-train.input_placement_device = "cpu"
-
-train.dist.pipeline_num_layers = model.cfg.hidden_layers
-
-for ds in dataloader.train.dataset:
-    ds.max_seq_length = model.cfg.max_seq_length
-
-optim.lr = 5.0e-05
-optim.params.clip_grad_max_norm = None
-optim.params.clip_grad_norm_type = None
-
-train.update(
-    dict(
-        output_dir="projects/MagicPrompt/oneflow_magicprompt",
-        train_micro_batch_size=4,
-        test_micro_batch_size=4,
-        train_epoch=33,
-        train_iter=10000,
-        log_period=1,
-        amp=dict(enabled=False),
-        warmup_ratio=0,
-        checkpointer=dict(period=8000, max_to_keep=20),
-        dist=dict(
-            data_parallel_size=4,
-            tensor_parallel_size=1,
-            pipeline_parallel_size=1,
-            # pipeline_num_layers = 12,
-            # custom_pipeline_stage_id = [0] * 6 + [1] * 6,
-            # pipeline_num_layers=model.cfg.hidden_layers,
-        ),
-        scheduler=LazyCall(WarmupExponentialLR)(
-            warmup_factor=0.0,
-            gamma=1.0,
-            warmup_method="linear",
-            warmup_iter=0.0,
-        ),
-        evaluation=dict(
-            enabled=True,
-            evaluator=LazyCall(PPLEvaluator)(),
-            eval_iter=250,
-            eval_period=4000,
-        ),
-        rdma_enabled=False,
-    )
-)
-
-[03/30 18:18:17] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
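(Editor's note: for reference, a minimal sketch of how a lazy config like the one above is typically loaded and overridden programmatically; it assumes LiBai exposes a LazyConfig/instantiate API alongside LazyCall, and the override value is illustrative.)

    from libai.config import LazyConfig, instantiate

    # Load the training config dumped above, tweak it, then build the model.
    cfg = LazyConfig.load("projects/MagicPrompt/configs/gpt2_training.py")
    cfg.train.train_micro_batch_size = 2  # illustrative override
    model = instantiate(cfg.model)        # materializes GPTForPreTraining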
-[03/30 18:18:17] lb.engine.default INFO: > compiling dataset index builder ...
-[03/30 18:18:17] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.088 seconds
-[03/30 18:18:17] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.089 seconds
-[03/30 18:18:19] lb.engine.default INFO: Prepare training, validating, testing set
-[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: reading sizes...
-[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: reading pointers...
-[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: reading document index...
-[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
-[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
-[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
-[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007127 seconds
-[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
-[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
-[03/30 18:18:19] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ...
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > last epoch number of samples (2084) is smaller than 80% of number of samples per epoch (4511), setting separate_last_epoch to True
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ...
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.142318
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ...
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.008270
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 157916) and [157916, 162428) ...
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.004275
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.005 seconds
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 162429
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 36
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ...
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > last epoch number of samples (2977) is smaller than 80% of number of samples per epoch (4511), setting separate_last_epoch to True
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ...
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.010095
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ...
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.000795
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 9023) and [9023, 13535) ...
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000490
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 13536
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 3
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > WARNING: could not find index map files, building the indices on rank 0 ...
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > only one epoch required, setting separate_last_epoch to False
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: start to build and save doc-idx mapping ...
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save doc-idx mapping (seconds): 0.003151
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: start to build and save sample-idx mapping ...
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save sample-idx mapping (seconds): 0.000337
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > building shuffle index with split [0, 4511) and [4511, 4511) ...
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > elapsed time to build and save shuffle-idx mapping (seconds): 0.000237
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/30 18:18:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/30 18:18:19] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/30 18:18:20] libai INFO: > Start building model...
-[03/30 18:18:25] libai INFO: >>> done with building model. Building time: 5.322 seconds
-[03/30 18:18:25] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/30 18:18:25] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 18:18:25] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 18:18:26] lb.engine.trainer INFO: Starting training from iteration 0
-[03/30 18:18:26] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/30 18:18:35] lb.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last):
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
-    self.run_step()
-  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
-    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
-    loss_dict = self.graph(**data)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
-    self._compile(*args, **kwargs)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
-    self.finish_compile_and_init_runtime()
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime
-    self._c_nn_graph.init_runtime()
-oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate
-    device->Alloc(options, &ptr, size)
-Error Type: oneflow.ErrorProto.runtime_error
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator()
-
-Error Type: oneflow.ErrorProto.runtime_error
-
-[03/30 18:18:35] lb.engine.hooks INFO: Total training time: 0:00:08 (0:00:00 on hooks)
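(Editor's note: the traceback above is an allocation failure on the MLU while initializing the compiled graph's runtime. Assuming the cause is device memory exhaustion, which the log does not state explicitly, one plausible first mitigation is shrinking the per-device batch in gpt2_training.py; the values below are illustrative, not a confirmed fix.)

    # Hypothetical edit to projects/MagicPrompt/configs/gpt2_training.py.
    train.update(
        dict(
            train_micro_batch_size=1,  # was 4; lowers peak MLU memory per rank
            test_micro_batch_size=1,
        )
    )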
-[03/31 14:31:38] libai INFO: Rank of current process: 0. World size: 1
-[03/31 14:31:38] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:31:38] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/31 14:31:38] lb.engine.default INFO: > compiling dataset index builder ...
-[03/31 14:31:38] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.050 seconds
-[03/31 14:31:38] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.051 seconds
-[03/31 14:31:40] lb.engine.default INFO: Prepare training, validating, testing set
-[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: reading sizes...
-[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: reading pointers...
-[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: reading document index...
-[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
-[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
-[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
-[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007057 seconds
-[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
-[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
-[03/31 14:31:40] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.005 seconds
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/31 14:31:40] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/31 14:31:40] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/31 14:31:40] libai INFO: > Start building model...
-[03/31 14:31:41] libai INFO: >>> done with building model. Building time: 0.162 seconds
-[03/31 14:31:41] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/31 14:31:41] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/31 14:31:41] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/31 14:31:42] lb.engine.trainer INFO: Starting training from iteration 0
-[03/31 14:31:42] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/31 14:31:48] lb.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last):
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
-    self.run_step()
-  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
-    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
-    loss_dict = self.graph(**data)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
-    self._compile(*args, **kwargs)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
-    self.finish_compile_and_init_runtime()
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime
-    self._c_nn_graph.init_runtime()
-oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate
-    device->Alloc(options, &ptr, size)
-Error Type: oneflow.ErrorProto.runtime_error
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator()
-
-Error Type: oneflow.ErrorProto.runtime_error
-
-[03/31 14:31:48] lb.engine.hooks INFO: Total training time: 0:00:05 (0:00:00 on hooks)
-[03/31 14:32:17] libai INFO: Rank of current process: 0. World size: 4
-[03/31 14:32:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:32:17] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/31 14:32:17] lb.engine.default INFO: > compiling dataset index builder ...
-[03/31 14:32:17] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.043 seconds
-[03/31 14:32:17] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.044 seconds
-[03/31 14:32:17] lb.engine.default INFO: > compiling dataset index builder ...
-[03/31 14:32:17] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.043 seconds
-[03/31 14:32:17] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.044 seconds
-[03/31 14:32:19] lb.engine.default INFO: Prepare training, validating, testing set
-[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: reading sizes...
-[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: reading pointers...
-[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: reading document index...
-[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
-[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
-[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
-[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.006932 seconds
-[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
-[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
-[03/31 14:32:19] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.010 seconds
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 162429
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 36
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 13536
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 3
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/31 14:32:19] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/31 14:32:20] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/31 14:32:20] libai INFO: > Start building model...
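A quick, illustrative cross-check of the index-map numbers above: the sample counts requested in the file names (the `...ns` suffix) get rounded up to whole epochs of the mapped dataset, so each mapping's total is approximately epochs times samples-per-epoch:

    # Illustrative check of the gpt_dataset index-map stats in this log.
    samples_per_epoch = 4512  # the 4000ns mapping covers exactly 1 epoch
    for requested, samples, epochs in [
        (4000, 4512, 1),
        (12000, 13536, 3),
        (160000, 162429, 36),
    ]:
        assert epochs * samples_per_epoch >= requested
        print(requested, samples, epochs, epochs * samples_per_epoch)
    # 13536 == 3 * 4512; 162429 is within a few samples of 36 * 4512 = 162432.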
-[03/31 14:40:48] libai INFO: Rank of current process: 0. World size: 4
-[03/31 14:40:48] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:40:48] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
[... config contents identical to the 14:32:17 dump above; omitted ...]
-[03/31 14:40:48] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/31 14:40:48] lb.engine.default INFO: > compiling dataset index builder ...
-[03/31 14:40:48] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.048 seconds
-[03/31 14:40:48] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.049 seconds
-[03/31 14:40:50] lb.engine.default INFO: Prepare training, validating, testing set
[... indexed-dataset build and index-map loading output matching the 14:32:19 run above (same document, sample, and epoch counts; timings differ by milliseconds); omitted ...]
-[03/31 14:40:50] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/31 14:40:50] libai INFO: > Start building model...
-[03/31 14:40:51] lb.engine.default INFO: Model:
-GPTForPreTraining(
-  (GPT_model): GPTModel(
-    (embeddings): GPTEmbedding(
-      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
-      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
-      (dropout): Dropout(p=0.1, inplace=False)
-    )
-    (transformer): Transformer(
-      (layers): ModuleList(
-        (0): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
[... (1) through (11): eleven further TransformerLayer blocks, identical to (0); omitted ...]
-      )
-      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-    )
-    (lm_head): LMLogits()
-  )
-  (loss_func): GPTLoss(
-    (lm_loss): ParallelCrossEntropyLoss()
-  )
-)
-[03/31 14:40:51] libai INFO: >>> done with building model. Building time: 0.843 seconds
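The printout pins down the architecture exactly (GPT-2 base: 12 layers, hidden size 768, 12 heads, padded vocabulary 50304, 1024 positions), so the parameter count can be checked by hand; a quick sketch, assuming the LMLogits head reuses the token-embedding matrix rather than adding its own weights:

    # Back-of-the-envelope parameter count for the module printout above.
    V, P, H, L = 50304, 1024, 768, 12   # padded vocab, positions, hidden, layers
    embed = V * H + P * H               # token + position embeddings
    per_layer = (
        2 * 2 * H            # input_layernorm + post_attention_layernorm
        + H * 3 * H + 3 * H  # query_key_value (col-parallel Linear1D)
        + H * H + H          # attention output dense (row-parallel)
        + H * 4 * H + 4 * H  # dense_h_to_4h
        + 4 * H * H + H      # dense_4h_to_h
    )
    total = embed + L * per_layer + 2 * H   # plus the final layernorm_f
    print(f"{total / 1e6:.1f}M")            # ~124.5M, i.e. GPT-2 base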
-[03/31 14:40:51] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/31 14:40:51] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/31 14:40:51] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/31 14:40:52] lb.engine.trainer INFO: Starting training from iteration 0
-[03/31 14:40:52] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/31 14:41:01] lb.engine.trainer ERROR: Exception during training:
[... traceback identical to the 14:31:48 failure above, ending in MluDevice::Alloc; omitted ...]
-[03/31 14:41:01] lb.engine.hooks INFO: Total training time: 0:00:08 (0:00:00 on hooks)
-[03/31 14:54:05] libai INFO: Rank of current process: 0. World size: 4
-[03/31 14:54:05] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:54:06] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
[... config contents identical to the 14:32:17 dump above; omitted ...]
-[03/31 14:54:06] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/31 14:54:06] lb.engine.default INFO: > compiling dataset index builder ...
-[03/31 14:54:06] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.056 seconds
-[03/31 14:54:06] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.058 seconds
-[03/31 14:54:08] lb.engine.default INFO: Prepare training, validating, testing set
[... indexed-dataset build and index-map loading output matching the 14:32:19 run above; omitted ...]
-[03/31 14:54:08] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/31 14:54:08] libai INFO: > Start building model...
-[03/31 14:54:09] lb.engine.default INFO: Model:
[... GPTForPreTraining printout identical to the 14:40:51 dump above; omitted ...]
-[03/31 14:54:09] libai INFO: >>> done with building model. Building time: 0.872 seconds
-[03/31 14:54:09] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/31 14:54:09] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/31 14:54:09] lb.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/31 14:54:10] lb.engine.trainer INFO: Starting training from iteration 0
-[03/31 14:54:10] lb.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/31 14:54:19] lb.engine.trainer ERROR: Exception during training:
[... traceback identical to the 14:31:48 failure above, ending in MluDevice::Alloc; omitted ...]
-[03/31 14:54:19] lb.engine.hooks INFO: Total training time: 0:00:09 (0:00:00 on hooks)
-[03/31 14:57:13] libai INFO: Rank of current process: 0. World size: 4
-[03/31 14:57:13] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:57:14] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
[... identical to the 14:32:17 config dump above except for this one line ...]
-graph.enabled=False
-[03/31 14:57:14] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/31 14:57:14] lb.engine.default INFO: > compiling dataset index builder ...
-[03/31 14:57:14] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.056 seconds
-[03/31 14:57:14] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.058 seconds
-[03/31 14:57:16] lb.engine.default INFO: Prepare training, validating, testing set
[... indexed-dataset build and index-map loading output matching the 14:32:19 run above; omitted ...]
-[03/31 14:57:16] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/31 14:57:16] libai INFO: > Start building model...
-[03/31 14:57:17] lb.engine.default INFO: Model:
[... GPTForPreTraining printout identical to the 14:40:51 dump above; omitted ...]
-[03/31 14:57:17] libai INFO: >>> done with building model. Building time: 0.885 seconds
-[03/31 14:57:17] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/31 14:57:18] lb.engine.trainer INFO: Starting training from iteration 0
-[03/31 14:57:47] lb.utils.events INFO: iteration: 0/335008 consumed_samples: 16 total_loss: 10.86 data_time: 0.0064 s/iter lr: 5.00e-05
-      )
-      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-    )
-    (lm_head): LMLogits()
-  )
-  (loss_func): GPTLoss(
-    (lm_loss): ParallelCrossEntropyLoss()
-  )
-)
-[03/31 14:57:17] libai INFO: >>> done with building model. Building time: 0.885 seconds
-[03/31 14:57:17] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/31 14:57:18] lb.engine.trainer INFO: Starting training from iteration 0
-[03/31 14:57:47] lb.utils.events INFO: iteration: 0/335008  consumed_samples: 16  total_loss: 10.86  data_time: 0.0064 s/iter  lr: 5.00e-05
-[03/31 14:59:32] libai INFO: Rank of current process: 0. World size: 4
-[03/31 14:59:32] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:59:32] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
-from libai.config import LazyCall
-from libai.evaluation import PPLEvaluator
-from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
-from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
-from configs.common.optim import optim
-
-from libai.scheduler import WarmupExponentialLR
-
-from configs.common.train import train
-from configs.common.models.graph import graph
-
-graph.enabled=False
-graph.debug = 2
-
-vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
-merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
-train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
-
-tokenization.tokenizer.vocab_file = vocab_file
-tokenization.tokenizer.merges_file = merge_files
-dataloader.train.dataset[0].data_prefix = train_data_prefix
-dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
-
-# gpt2 model config
-model.cfg.pretrained_model_path = None
-model.cfg.embedding_dropout_prob = 0.1
-model.cfg.attention_dropout_prob = 0.1
-model.cfg.output_dropout_prob = 0.1
-model.cfg.num_attention_heads = 12
-model.cfg.hidden_size = 768
-model.cfg.ffn_hidden_size = 4 * 768
-model.cfg.hidden_layers = 12
-model.cfg.max_seq_length = 1024
-model.cfg.initializer_range = 0.02
-model.cfg.vocab_size = 50257
-model.cfg.layernorm_epsilon = 1e-5
-model.cfg.use_scaled_init_for_output_weights = True
-model.cfg.bias_gelu_fusion = False
-model.cfg.bias_dropout_fusion = False
-model.cfg.scale_mask_softmax_fusion = False
-model.cfg.apply_query_key_layer_scaling = True
-model.cfg.apply_residual_post_layernorm = False
-model.cfg.amp_enabled = False
-
-train.input_placement_device = "cpu"
-
-train.dist.pipeline_num_layers = model.cfg.hidden_layers
-
-for ds in dataloader.train.dataset:
-    ds.max_seq_length = model.cfg.max_seq_length
-
-optim.lr = 5.0e-05
-optim.params.clip_grad_max_norm = None
-optim.params.clip_grad_norm_type = None
-
-train.update(
-    dict(
-        output_dir="projects/MagicPrompt/oneflow_magicprompt",
-        train_micro_batch_size=4,
-        test_micro_batch_size=4,
-        train_epoch=33,
-        train_iter=10000,
-        log_period=1,
-        amp=dict(enabled=False),
-        warmup_ratio=0,
-        checkpointer=dict(period=8000, max_to_keep=20),
-        dist=dict(
-            data_parallel_size=4,
-            tensor_parallel_size=1,
-            pipeline_parallel_size=1,
-            # pipeline_num_layers = 12,
-            # custom_pipeline_stage_id = [0] * 6 + [1] * 6,
-            # pipeline_num_layers=model.cfg.hidden_layers,
-        ),
-        scheduler=LazyCall(WarmupExponentialLR)(
-            warmup_factor=0.0,
-            gamma=1.0,
-            warmup_method="linear",
-            warmup_iter=0.0,
-        ),
-        evaluation=dict(
-            enabled=True,
-            evaluator=LazyCall(PPLEvaluator)(),
-            eval_iter=250,
-            eval_period=4000,
-        ),
-        rdma_enabled=False,
-    )
-)
-
-[03/31 14:59:33] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
-[03/31 14:59:33] lb.engine.default INFO: > compiling dataset index builder ...
-[03/31 14:59:33] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.088 seconds
-[03/31 14:59:33] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.090 seconds
-[03/31 14:59:35] lb.engine.default INFO: Prepare training, validating, testing set
-[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: reading sizes...
-[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: reading pointers...
-[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: reading document index...
-[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
-[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
-[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
-[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008213 seconds
-[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
-[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO:     number of documents: 73718
-[03/31 14:59:35] lb.data.data_utils.indexed_dataset INFO:     number of sentences: 106365
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO:  > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO:  > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO:  > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.011 seconds
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: total number of samples: 162429
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: total number of epochs: 36
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO:  > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO:  > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO:  > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: total number of samples: 13536
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: total number of epochs: 3
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO:  > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO:  > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO:  > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/31 14:59:35] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/31 14:59:35] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/31 14:59:35] libai INFO: > Start building model...
-[03/31 14:59:36] lb.engine.default INFO: Model:
[... GPTForPreTraining model summary identical to the one above (12 TransformerLayer blocks) ...]
-[03/31 14:59:36] libai INFO: >>> done with building model. Building time: 0.831 seconds
-[03/31 14:59:36] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/31 14:59:37] lb.engine.trainer INFO: Starting training from iteration 0
-[03/31 15:00:06] lb.utils.events INFO: iteration: 0/335008  consumed_samples: 16  total_loss: 10.86  data_time: 0.0058 s/iter  lr: 5.00e-05
diff --git a/projects/MagicPrompt/oneflow_magicprompt/log.txt.rank1 b/projects/MagicPrompt/oneflow_magicprompt/log.txt.rank1
deleted file mode 100644
index dd0d51b6e..000000000
--- a/projects/MagicPrompt/oneflow_magicprompt/log.txt.rank1
+++ /dev/null
@@ -1,2084 +0,0 @@
-[03/30 18:18:17] libai INFO: Rank of current process: 1. World size: 4
-[03/30 18:18:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/30 18:18:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
[... gpt2_training.py contents identical to the dump above, except graph.enabled=True ...]
-[03/30 18:18:19] libai.engine.default INFO: Prepare training, validating, testing set
[... dataset index building and index-map loading output matching the run above (73718 documents, 106365 sentences; 162429/13536/4512 samples over 36/3/1 epochs) ...]
-[03/30 18:18:20] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/30 18:18:20] libai INFO: > Start building model...
-[03/30 18:18:25] libai.engine.default INFO: Model:
[... GPTForPreTraining model summary identical to the one above (12 TransformerLayer blocks) ...]
-[03/30 18:18:25] libai INFO: >>> done with building model. Building time: 5.321 seconds
-[03/30 18:18:25] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/30 18:18:25] libai.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 18:18:25] libai.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 18:18:25] libai.engine.trainer INFO: Starting training from iteration 0
-[03/30 18:18:26] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/30 18:18:35] libai.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last):
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
-    self.run_step()
-  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
-    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
-    loss_dict = self.graph(**data)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
-    self._compile(*args, **kwargs)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
-    self.finish_compile_and_init_runtime()
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime
-    self._c_nn_graph.init_runtime()
-oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate
-    device->Alloc(options, &ptr, size)
-Error Type: oneflow.ErrorProto.runtime_error
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator()
-
-Error Type: oneflow.ErrorProto.runtime_error
-
-[03/30 18:18:35] libai.engine.hooks INFO: Total training time: 0:00:10 (0:00:00 on hooks)
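[Note: the run above crashed in MluDevice::Alloc while nn.Graph compilation was initializing its runtime; it was launched with graph.enabled=True, whereas the rank-0 runs that reached iteration 0 (14:57 and 14:59 above) used graph.enabled=False. A minimal eager-mode fallback, sketched under the assumption that graph is the same LazyConfig object imported in gpt2_training.py; these logs do not establish whether graph mode can be made to work on this MLU setup:

    from configs.common.models.graph import graph

    # Sketch: fall back to eager execution. The failing runs compiled an
    # nn.Graph and MLU memory allocation (MluDevice::Alloc) failed inside
    # finish_compile_and_init_runtime(); the eager runs trained past iter 0.
    graph.enabled = False
    graph.debug = 2  # unchanged from the logged config
]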
-[03/31 14:32:17] libai INFO: Rank of current process: 1. World size: 4
-[03/31 14:32:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:32:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
[... gpt2_training.py contents identical to the previous dump (graph.enabled=True) ...]
-[03/31 14:32:19] libai.engine.default INFO: Prepare training, validating, testing set
[... dataset index building and index-map loading output matching the runs above ...]
-[03/31 14:32:20] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/31 14:32:20] libai INFO: > Start building model...
-[03/31 14:40:48] libai INFO: Rank of current process: 1. World size: 4
-[03/31 14:40:48] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:40:48] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
[... gpt2_training.py contents identical to the previous dump (graph.enabled=True) ...]
-[03/31 14:40:50] libai.engine.default INFO: Prepare training, validating, testing set
[... dataset index building and index-map loading output matching the runs above ...]
-[03/31 14:40:50] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/31 14:40:50] libai INFO: > Start building model...
-[03/31 14:40:51] libai.engine.default INFO: Model: -GPTForPreTraining( - (GPT_model): GPTModel( - (embeddings): GPTEmbedding( - (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) - (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) - (dropout): Dropout(p=0.1, inplace=False) - ) - (transformer): Transformer( - (layers): ModuleList( - (0): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (1): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (2): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (3): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): 
Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (4): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (5): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (6): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (7): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, 
inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (8): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (9): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (10): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (11): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, 
num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - ) - (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - ) - (lm_head): LMLogits() - ) - (loss_func): GPTLoss( - (lm_loss): ParallelCrossEntropyLoss() - ) -) -[03/31 14:40:51] libai INFO: >>> done with building model. Building time: 0.843 seconds -[03/31 14:40:51] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR -[03/31 14:40:51] libai.engine.default INFO: Graph debug mode on, automatically output debug info. -[03/31 14:40:51] libai.engine.default INFO: Graph debug mode on, automatically output debug info. -[03/31 14:40:51] libai.engine.trainer INFO: Starting training from iteration 0 -[03/31 14:40:52] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... -[03/31 14:41:01] libai.engine.trainer ERROR: Exception during training: -Traceback (most recent call last): - File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train - self.run_step() - File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step - self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) - File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step - loss_dict = self.graph(**data) - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__ - self._compile(*args, **kwargs) - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile - self.finish_compile_and_init_runtime() - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime - self._c_nn_graph.init_runtime() -oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate - device->Alloc(options, &ptr, size) -Error Type: oneflow.ErrorProto.runtime_error - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator() - -Error Type: oneflow.ErrorProto.runtime_error - -[03/31 14:41:01] libai.engine.hooks INFO: Total training time: 0:00:09 (0:00:00 on hooks) -[03/31 14:54:05] libai INFO: Rank of current process: 1. 
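Note on the failure above: the run dies inside nn.Graph's finish_compile_and_init_runtime() with an MLU device allocation error, and the next run recorded in this log is relaunched with graph.enabled=False, after which model building completes and training starts normally. A minimal sketch of that config change, assuming the same gpt2_training.py shown above; the commented-out alternatives at the end are hypothetical suggestions, not settings taken from this log:

    # Fall back to eager execution so init_runtime() does not have to
    # reserve the compiled graph's whole memory plan on the MLU at once.
    from configs.common.models.graph import graph

    graph.enabled = False  # was True in the failing run above
    graph.debug = 2        # unchanged

    # Hypothetical alternatives if graph mode is required: shrink the
    # per-device memory footprint instead (illustrative values only).
    # train.train_micro_batch_size = 2
    # model.cfg.max_seq_length = 512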
-[03/31 14:57:13] libai INFO: Rank of current process: 1. World size: 4
-[03/31 14:57:13] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:57:14] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
-from libai.config import LazyCall
-from libai.evaluation import PPLEvaluator
-from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
-from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
-from configs.common.optim import optim
-
-from libai.scheduler import WarmupExponentialLR
-
-from configs.common.train import train
-from configs.common.models.graph import graph
-
-graph.enabled=False
-graph.debug = 2
-
-vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
-merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
-train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
-
-tokenization.tokenizer.vocab_file = vocab_file
-tokenization.tokenizer.merges_file = merge_files
-dataloader.train.dataset[0].data_prefix = train_data_prefix
-dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
-
-# gpt2 model config
-model.cfg.pretrained_model_path = None
-model.cfg.embedding_dropout_prob = 0.1
-model.cfg.attention_dropout_prob = 0.1
-model.cfg.output_dropout_prob = 0.1
-model.cfg.num_attention_heads = 12
-model.cfg.hidden_size = 768
-model.cfg.ffn_hidden_size = 4 * 768
-model.cfg.hidden_layers = 12
-model.cfg.max_seq_length = 1024
-model.cfg.initializer_range = 0.02
-model.cfg.vocab_size = 50257
-model.cfg.layernorm_epsilon = 1e-5
-model.cfg.use_scaled_init_for_output_weights = True
-model.cfg.bias_gelu_fusion = False
-model.cfg.bias_dropout_fusion = False
-model.cfg.scale_mask_softmax_fusion = False
-model.cfg.apply_query_key_layer_scaling = True
-model.cfg.apply_residual_post_layernorm = False
-model.cfg.amp_enabled = False
-
-train.input_placement_device = "cpu"
-
-train.dist.pipeline_num_layers = model.cfg.hidden_layers
-
-for ds in dataloader.train.dataset:
-    ds.max_seq_length = model.cfg.max_seq_length
-
-optim.lr = 5.0e-05
-optim.params.clip_grad_max_norm = None
-optim.params.clip_grad_norm_type = None
-
-train.update(
-    dict(
-        output_dir="projects/MagicPrompt/oneflow_magicprompt",
-        train_micro_batch_size=4,
-        test_micro_batch_size=4,
-        train_epoch=33,
-        train_iter=10000,
-        log_period=1,
-        amp=dict(enabled=False),
-        warmup_ratio=0,
-        checkpointer=dict(period=8000, max_to_keep=20),
-        dist=dict(
-            data_parallel_size=4,
-            tensor_parallel_size=1,
-            pipeline_parallel_size=1,
-            # pipeline_num_layers = 12,
-            # custom_pipeline_stage_id = [0] * 6 + [1] * 6,
-            # pipeline_num_layers=model.cfg.hidden_layers,
-        ),
-        scheduler=LazyCall(WarmupExponentialLR)(
-            warmup_factor=0.0,
-            gamma=1.0,
-            warmup_method="linear",
-            warmup_iter=0.0,
-        ),
-        evaluation=dict(
-            enabled=True,
-            evaluator=LazyCall(PPLEvaluator)(),
-            eval_iter=250,
-            eval_period=4000,
-        ),
-        rdma_enabled=False,
-    )
-)
-
-[03/31 14:57:16] libai.engine.default INFO: Prepare training, validating, testing set
-[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: reading sizes...
-[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: reading pointers...
-[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: reading document index...
-[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file...
-[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
-[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
-[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008388 seconds
-[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats:
-[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718
-[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.015 seconds
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: total number of samples: 13536
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: total number of epochs: 3
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/31 14:57:16] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/31 14:57:16] libai INFO: > Start building model...
-[03/31 14:57:17] libai INFO: >>> done with building model. Building time: 0.885 seconds
-[03/31 14:57:17] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/31 14:57:17] libai.engine.trainer INFO: Starting training from iteration 0
-[03/31 14:59:32] libai INFO: Rank of current process: 1. World size: 4
-[03/31 14:59:32] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:59:32] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
-from libai.config import LazyCall
-from libai.evaluation import PPLEvaluator
-from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
-from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
-from configs.common.optim import optim
-
-from libai.scheduler import WarmupExponentialLR
-
-from configs.common.train import train
-from configs.common.models.graph import graph
-
-graph.enabled=False
-graph.debug = 2
-
-vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
-merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
-train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
-
-tokenization.tokenizer.vocab_file = vocab_file
-tokenization.tokenizer.merges_file = merge_files
-dataloader.train.dataset[0].data_prefix = train_data_prefix
-dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
-
-# gpt2 model config
-model.cfg.pretrained_model_path = None
-model.cfg.embedding_dropout_prob = 0.1
-model.cfg.attention_dropout_prob = 0.1
-model.cfg.output_dropout_prob = 0.1
-model.cfg.num_attention_heads = 12
-model.cfg.hidden_size = 768
-model.cfg.ffn_hidden_size = 4 * 768
-model.cfg.hidden_layers = 12
-model.cfg.max_seq_length = 1024
-model.cfg.initializer_range = 0.02
-model.cfg.vocab_size = 50257
-model.cfg.layernorm_epsilon = 1e-5
-model.cfg.use_scaled_init_for_output_weights = True
-model.cfg.bias_gelu_fusion = False
-model.cfg.bias_dropout_fusion = False
-model.cfg.scale_mask_softmax_fusion = False
-model.cfg.apply_query_key_layer_scaling = True
-model.cfg.apply_residual_post_layernorm = False
-model.cfg.amp_enabled = False
-
-train.input_placement_device = "cpu"
-
-train.dist.pipeline_num_layers = model.cfg.hidden_layers
-
-for ds in dataloader.train.dataset:
-    ds.max_seq_length = model.cfg.max_seq_length
-
-optim.lr = 5.0e-05
-optim.params.clip_grad_max_norm = None
-optim.params.clip_grad_norm_type = None
-
-train.update(
-    dict(
output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=1, - amp=dict(enabled=False), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=4, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers = 12, - # custom_pipeline_stage_id = [0] * 6 + [1] * 6, - # pipeline_num_layers=model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/31 14:59:35] libai.engine.default INFO: Prepare training, validating, testing set -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: building dataset index ... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: reading sizes... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: reading pointers... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: reading document index... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007985 seconds -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats: -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718 -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365 -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.008 seconds -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429 -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36 -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.002 seconds -[03/31 14:59:35] 
libai.data.datasets.gpt_dataset INFO: total number of samples: 13536 -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: total number of epochs: 3 -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/31 14:59:35] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 -[03/31 14:59:35] libai INFO: > Start building model... -[03/31 14:59:36] libai.engine.default INFO: Model: -GPTForPreTraining( - (GPT_model): GPTModel( - (embeddings): GPTEmbedding( - (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) - (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) - (dropout): Dropout(p=0.1, inplace=False) - ) - (transformer): Transformer( - (layers): ModuleList( - (0-11): 12 x TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - ) - (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - ) - (lm_head): LMLogits() - ) - (loss_func): GPTLoss( - (lm_loss): ParallelCrossEntropyLoss() - ) -) -[03/31 14:59:36] libai INFO: >>> done with building model. Building time: 0.831 seconds -[03/31 14:59:36] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR -[03/31 14:59:36] libai.engine.trainer INFO: Starting training from iteration 0 diff --git a/projects/MagicPrompt/oneflow_magicprompt/log.txt.rank2 b/projects/MagicPrompt/oneflow_magicprompt/log.txt.rank2 deleted file mode 100644 index 042069405..000000000 --- a/projects/MagicPrompt/oneflow_magicprompt/log.txt.rank2 +++ /dev/null @@ -1,2084 +0,0 @@ -[03/30 18:18:17] libai INFO: Rank of current process: 2. 
World size: 4 -[03/30 18:18:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[03/30 18:18:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - -graph.enabled=True -graph.debug = 2 - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = False -model.cfg.bias_dropout_fusion = False -model.cfg.scale_mask_softmax_fusion = False -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = False - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 -optim.params.clip_grad_max_norm = None -optim.params.clip_grad_norm_type = None - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=1, - amp=dict(enabled=False), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=4, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers = 12, - # custom_pipeline_stage_id = [0] * 6 + [1] * 6, - # pipeline_num_layers=model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/30 18:18:19] libai.engine.default INFO: Prepare training, validating, testing set -[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: building dataset index ... -[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file... -[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: reading sizes... -[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: reading pointers... -[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: reading document index... 
-[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file... -[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... -[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... -[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008698 seconds -[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats: -[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718 -[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365 -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.014 seconds -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429 -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36 -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 13536 -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 3 -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/30 18:18:19] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 -[03/30 18:18:20] libai INFO: > Start building model... 
-[03/30 18:18:25] libai.engine.default INFO: Model: -GPTForPreTraining( - (GPT_model): GPTModel( - (embeddings): GPTEmbedding( - (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) - (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) - (dropout): Dropout(p=0.1, inplace=False) - ) - (transformer): Transformer( - (layers): ModuleList( - (0-11): 12 x TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - ) - (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - ) - (lm_head): LMLogits() - ) - (loss_func): GPTLoss( - (lm_loss): ParallelCrossEntropyLoss() - ) -)
-[03/30 18:18:25] libai INFO: >>> done with building model. Building time: 5.322 seconds -[03/30 18:18:25] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR -[03/30 18:18:25] libai.engine.default INFO: Graph debug mode on, automatically output debug info. -[03/30 18:18:25] libai.engine.default INFO: Graph debug mode on, automatically output debug info. -[03/30 18:18:25] libai.engine.trainer INFO: Starting training from iteration 0 -[03/30 18:18:26] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... -[03/30 18:18:35] libai.engine.trainer ERROR: Exception during training: -Traceback (most recent call last): - File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train - self.run_step() - File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step - self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) - File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step - loss_dict = self.graph(**data) - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__ - self._compile(*args, **kwargs) - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile - self.finish_compile_and_init_runtime() - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime - self._c_nn_graph.init_runtime() -oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate - device->Alloc(options, &ptr, size) -Error Type: oneflow.ErrorProto.runtime_error - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator() - -Error Type: oneflow.ErrorProto.runtime_error - -[03/30 18:18:35] libai.engine.hooks INFO: Total training time: 0:00:10 (0:00:00 on hooks)
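The aborted graph-mode runs recorded in this log fail at the same point: _c_nn_graph.init_runtime() triggers MluDevice::Alloc, i.e. allocation on the Cambricon device fails while the compiled nn.Graph materializes its buffers. The rank-1 log above, whose config differs only in graph.enabled=False, reports no such failure after "Starting training from iteration 0". That pattern is consistent with the MLU running out of device memory during static-graph runtime initialization, although the card's capacity is never printed in these logs. A hedged sketch of config overrides that would test this reading against projects/MagicPrompt/configs/gpt2_training.py (the reduced batch-size value is an assumption, not a measured fit):

# Sketch only: assumes MluDevice::Alloc failed because the static graph
# needs more device memory than eager mode does at the same batch size.
graph.enabled = False  # eager mode, which got past this point in the rank-1 log
# ...or stay in graph mode and halve the per-device footprint:
# graph.enabled = True
# train.update(dict(train_micro_batch_size=2))  # the failing runs used 4

Only the eager-mode fallback is actually corroborated by the logs in this patch; the smaller micro batch would need to be verified on the device.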
num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - ) - (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - ) - (lm_head): LMLogits() - ) - (loss_func): GPTLoss( - (lm_loss): ParallelCrossEntropyLoss() - ) -) -[03/31 14:40:51] libai INFO: >>> done with building model. Building time: 0.843 seconds -[03/31 14:40:51] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR -[03/31 14:40:51] libai.engine.default INFO: Graph debug mode on, automatically output debug info. -[03/31 14:40:51] libai.engine.default INFO: Graph debug mode on, automatically output debug info. -[03/31 14:40:51] libai.engine.trainer INFO: Starting training from iteration 0 -[03/31 14:40:52] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... -[03/31 14:41:01] libai.engine.trainer ERROR: Exception during training: -Traceback (most recent call last): - File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train - self.run_step() - File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step - self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) - File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step - loss_dict = self.graph(**data) - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__ - self._compile(*args, **kwargs) - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile - self.finish_compile_and_init_runtime() - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime - self._c_nn_graph.init_runtime() -oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate - device->Alloc(options, &ptr, size) -Error Type: oneflow.ErrorProto.runtime_error - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator() - -Error Type: oneflow.ErrorProto.runtime_error - -[03/31 14:41:01] libai.engine.hooks INFO: Total training time: 0:00:09 (0:00:00 on hooks) -[03/31 14:54:05] libai INFO: Rank of current process: 2. 
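Note on the failure above: the exception is raised from finish_compile_and_init_runtime() -> MluDevice::Alloc, i.e. while the static nn.Graph runtime reserves device memory, before any training op runs. The retries later in this log flip the config to eager mode. A minimal sketch of that toggle, using only names that appear in the config dumps (configs.common.models.graph):

    # Sketch of the eager-mode toggle used in the 14:57/14:59 retries below.
    # Field names are taken from the dumped gpt2_training.py config.
    from configs.common.models.graph import graph

    graph.enabled = False  # skip nn.Graph compilation and its MLU runtime allocation
    graph.debug = 2        # presumably only relevant while graph mode is on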
-[03/31 14:54:05] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:54:06] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
-from libai.config import LazyCall
-from libai.evaluation import PPLEvaluator
-from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
-from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
-from configs.common.optim import optim
-
-from libai.scheduler import WarmupExponentialLR
-
-from configs.common.train import train
-from configs.common.models.graph import graph
-
-graph.enabled=True
-graph.debug = 2
-
-vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
-merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
-train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
-
-tokenization.tokenizer.vocab_file = vocab_file
-tokenization.tokenizer.merges_file = merge_files
-dataloader.train.dataset[0].data_prefix = train_data_prefix
-dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
-
-# gpt2 model config
-model.cfg.pretrained_model_path = None
-model.cfg.embedding_dropout_prob = 0.1
-model.cfg.attention_dropout_prob = 0.1
-model.cfg.output_dropout_prob = 0.1
-model.cfg.num_attention_heads = 12
-model.cfg.hidden_size = 768
-model.cfg.ffn_hidden_size = 4 * 768
-model.cfg.hidden_layers = 12
-model.cfg.max_seq_length = 1024
-model.cfg.initializer_range = 0.02
-model.cfg.vocab_size = 50257
-model.cfg.layernorm_epsilon = 1e-5
-model.cfg.use_scaled_init_for_output_weights = True
-model.cfg.bias_gelu_fusion = False
-model.cfg.bias_dropout_fusion = False
-model.cfg.scale_mask_softmax_fusion = False
-model.cfg.apply_query_key_layer_scaling = True
-model.cfg.apply_residual_post_layernorm = False
-model.cfg.amp_enabled = False
-
-train.input_placement_device = "cpu"
-
-train.dist.pipeline_num_layers = model.cfg.hidden_layers
-
-for ds in dataloader.train.dataset:
-    ds.max_seq_length = model.cfg.max_seq_length
-
-optim.lr = 5.0e-05
-optim.params.clip_grad_max_norm = None
-optim.params.clip_grad_norm_type = None
-
-train.update(
-    dict(
-        output_dir="projects/MagicPrompt/oneflow_magicprompt",
-        train_micro_batch_size=4,
-        test_micro_batch_size=4,
-        train_epoch=33,
-        train_iter=10000,
-        log_period=1,
-        amp=dict(enabled=False),
-        warmup_ratio=0,
-        checkpointer=dict(period=8000, max_to_keep=20),
-        dist=dict(
-            data_parallel_size=4,
-            tensor_parallel_size=1,
-            pipeline_parallel_size=1,
-            # pipeline_num_layers = 12,
-            # custom_pipeline_stage_id = [0] * 6 + [1] * 6,
-            # pipeline_num_layers=model.cfg.hidden_layers,
-        ),
-        scheduler=LazyCall(WarmupExponentialLR)(
-            warmup_factor=0.0,
-            gamma=1.0,
-            warmup_method="linear",
-            warmup_iter=0.0,
-        ),
-        evaluation=dict(
-            enabled=True,
-            evaluator=LazyCall(PPLEvaluator)(),
-            eval_iter=250,
-            eval_period=4000,
-        ),
-        rdma_enabled=False,
-    )
-)
-
-[03/31 14:54:08] libai.engine.default INFO: Prepare training, validating, testing set
-[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: reading sizes...
-[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: reading pointers...
-[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: reading document index...
-[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file...
-[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
-[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
-[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007197 seconds
-[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats:
-[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718
-[03/31 14:54:08] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.010 seconds
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of samples: 13536
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of epochs: 3
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/31 14:54:08] libai.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/31 14:54:08] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/31 14:54:08] libai INFO: > Start building model...
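For reference, the "Auto-scaling the config to train.train_iter=335008" line is roughly what the dumped config implies: 33 epochs over 162429 samples at a global batch of train_micro_batch_size (4) * data_parallel_size (4) = 16. A back-of-envelope check; the exact rounding rule is LiBai-internal, so this is an assumption that may differ by a few iterations:

    # Rough check of the auto-scaled train_iter logged above.
    samples = 162429      # "total number of samples" from the dataset logs
    global_batch = 4 * 4  # train_micro_batch_size * data_parallel_size
    train_epoch = 33
    print(train_epoch * samples // global_batch)  # 335009, vs. 335008 in the log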
-[03/31 14:54:09] libai.engine.default INFO: Model:
-GPTForPreTraining(
[... GPTForPreTraining printout identical to the one above omitted ...]
-)
-[03/31 14:54:09] libai INFO: >>> done with building model. Building time: 0.872 seconds
-[03/31 14:54:09] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/31 14:54:09] libai.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/31 14:54:09] libai.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/31 14:54:09] libai.engine.trainer INFO: Starting training from iteration 0
-[03/31 14:54:10] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/31 14:54:19] libai.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last):
[... MluDevice::Alloc traceback identical to the 14:40 run omitted ...]
-[03/31 14:54:19] libai.engine.hooks INFO: Total training time: 0:00:10 (0:00:00 on hooks)
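Both graph-mode runs (14:40 and 14:54) die in the same MluDevice::Alloc call, while the eager retries below start training without error; that points at graph-runtime memory planning rather than basic MLU allocation. A hypothetical sanity check, not part of these logs (it assumes flow.ones accepts a device= string for the Cambricon backend, mirroring upstream OneFlow):

    # Hypothetical: confirm a plain eager allocation on the MLU works.
    # If this succeeds while the graph build fails, the problem is graph
    # runtime memory planning, not device allocation as such.
    import oneflow as flow

    x = flow.ones(1024, 1024, device="mlu")
    print(x.shape, x.sum().to("cpu").item())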
-[03/31 14:57:13] libai INFO: Rank of current process: 2. World size: 4
-[03/31 14:57:13] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:57:14] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
[... same config dump as above, except graph.enabled=False ...]
-[03/31 14:57:16] libai.engine.default INFO: Prepare training, validating, testing set
[... same indexed-dataset loading logs as above (73718 documents, 106365 sentences; 162429, 13536 and 4512 samples) omitted ...]
-[03/31 14:57:16] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/31 14:57:16] libai INFO: > Start building model...
-[03/31 14:57:17] libai.engine.default INFO: Model:
-GPTForPreTraining(
[... GPTForPreTraining printout identical to the one above omitted ...]
-)
-[03/31 14:57:17] libai INFO: >>> done with building model. Building time: 0.885 seconds
-[03/31 14:57:17] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/31 14:57:17] libai.engine.trainer INFO: Starting training from iteration 0
-[03/31 14:59:32] libai INFO: Rank of current process: 2. World size: 4
-[03/31 14:59:32] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:59:32] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
[... same config dump as above, again with graph.enabled=False ...]
-[03/31 14:59:35] libai.engine.default INFO: Prepare training, validating, testing set
[... same indexed-dataset loading logs as above omitted ...]
-[03/31 14:59:35] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/31 14:59:35] libai INFO: > Start building model...
-[03/31 14:59:36] libai.engine.default INFO: Model:
-GPTForPreTraining(
[... GPTForPreTraining printout identical to the one above omitted ...]
-)
-[03/31 14:59:36] libai INFO: >>> done with building model. Building time: 0.831 seconds
-[03/31 14:59:36] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/31 14:59:36] libai.engine.trainer INFO: Starting training from iteration 0
diff --git a/projects/MagicPrompt/oneflow_magicprompt/log.txt.rank3 b/projects/MagicPrompt/oneflow_magicprompt/log.txt.rank3
deleted file mode 100644
index 75fd65b0c..000000000
--- a/projects/MagicPrompt/oneflow_magicprompt/log.txt.rank3
+++ /dev/null
@@ -1,2084 +0,0 @@
-[03/30 18:18:17] libai INFO: Rank of current process: 3. World size: 4
-[03/30 18:18:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/30 18:18:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
-from libai.config import LazyCall
-from libai.evaluation import PPLEvaluator
-from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
-from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
-from configs.common.optim import optim
-
-from libai.scheduler import WarmupExponentialLR
-
-from configs.common.train import train
-from configs.common.models.graph import graph
-
-graph.enabled = True
-graph.debug = 2
-
-vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
-merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
-train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
-
-tokenization.tokenizer.vocab_file = vocab_file
-tokenization.tokenizer.merges_file = merge_files
-dataloader.train.dataset[0].data_prefix = train_data_prefix
-dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
-
-# gpt2 model config
-model.cfg.pretrained_model_path = None
-model.cfg.embedding_dropout_prob = 0.1
-model.cfg.attention_dropout_prob = 0.1
-model.cfg.output_dropout_prob = 0.1
-model.cfg.num_attention_heads = 12
-model.cfg.hidden_size = 768
-model.cfg.ffn_hidden_size = 4 * 768
-model.cfg.hidden_layers = 12
-model.cfg.max_seq_length = 1024
-model.cfg.initializer_range = 0.02
-model.cfg.vocab_size = 50257
-model.cfg.layernorm_epsilon = 1e-5
-model.cfg.use_scaled_init_for_output_weights = True
-model.cfg.bias_gelu_fusion = False
-model.cfg.bias_dropout_fusion = False
-model.cfg.scale_mask_softmax_fusion = False
-model.cfg.apply_query_key_layer_scaling = True
-model.cfg.apply_residual_post_layernorm = False
-model.cfg.amp_enabled = False
-
-train.input_placement_device = "cpu"
-
-train.dist.pipeline_num_layers = model.cfg.hidden_layers
-
-for ds in dataloader.train.dataset:
-    ds.max_seq_length = model.cfg.max_seq_length
-
-optim.lr = 5.0e-05
-optim.params.clip_grad_max_norm = None
-optim.params.clip_grad_norm_type = None
-
-train.update(
-    dict(
-        output_dir="projects/MagicPrompt/oneflow_magicprompt",
-        train_micro_batch_size=4,
-        test_micro_batch_size=4,
-        train_epoch=33,
-        train_iter=10000,
-        log_period=1,
-        amp=dict(enabled=False),
-        warmup_ratio=0,
-        checkpointer=dict(period=8000, max_to_keep=20),
-        dist=dict(
-            data_parallel_size=4,
-            tensor_parallel_size=1,
-            pipeline_parallel_size=1,
-            # pipeline_num_layers = 12,
-            # custom_pipeline_stage_id = [0] * 6 + [1] * 6,
-            # pipeline_num_layers=model.cfg.hidden_layers,
-        ),
-        scheduler=LazyCall(WarmupExponentialLR)(
-            warmup_factor=0.0,
-            gamma=1.0,
-            warmup_method="linear",
-            warmup_iter=0.0,
-        ),
-        evaluation=dict(
-            enabled=True,
-            evaluator=LazyCall(PPLEvaluator)(),
-            eval_iter=250,
-            eval_period=4000,
-        ),
-        rdma_enabled=False,
-    )
-)
-
-[03/30 18:18:19] libai.engine.default INFO: Prepare training, validating, testing set
-[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: building dataset index ...
-[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: reading sizes...
-[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: reading pointers...
-[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: reading document index...
-[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file...
-[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
-[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
-[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007484 seconds
-[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats:
-[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718
-[03/30 18:18:19] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.014 seconds
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 13536
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 3
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[03/30 18:18:19] libai.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[03/30 18:18:19] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/30 18:18:20] libai INFO: > Start building model...
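[Editor's note: the .npy mapping files loaded above appear to encode the requested number of samples, sequence length, and shuffle seed in their names. A small illustrative sketch of that pattern; the helper name indexmap_path is hypothetical and inferred only from the paths in this log, not from the LibAI API.]

def indexmap_path(prefix, name, num_samples, seq_len, seed, kind):
    # kind is one of "doc_idx", "sample_idx", "shuffle_idx"
    return f"{prefix}_{name}_indexmap_{num_samples}ns_{seq_len}sl_{seed}s_{kind}.npy"

# Reproduces the first mapping path in the log above:
print(indexmap_path(
    "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence",
    "gpt-2", 160000, 1024, 1234, "doc_idx"))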
-[03/30 18:18:25] libai.engine.default INFO: Model:
-GPTForPreTraining(
-  (GPT_model): GPTModel(
-    (embeddings): GPTEmbedding(
-      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
-      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
-      (dropout): Dropout(p=0.1, inplace=False)
-    )
-    (transformer): Transformer(
-      (layers): ModuleList(
-        (0): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
[... TransformerLayer blocks (1)-(11), identical to layer (0), elided ...]
-      )
-      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-    )
-    (lm_head): LMLogits()
-  )
-  (loss_func): GPTLoss(
-    (lm_loss): ParallelCrossEntropyLoss()
-  )
-)
-[03/30 18:18:25] libai INFO: >>> done with building model. Building time: 5.321 seconds
-[03/30 18:18:25] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/30 18:18:25] libai.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 18:18:25] libai.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/30 18:18:25] libai.engine.trainer INFO: Starting training from iteration 0
-[03/30 18:18:26] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/30 18:18:35] libai.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last):
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train
-    self.run_step()
-  File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step
-    self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
-  File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step
-    loss_dict = self.graph(**data)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__
-    self._compile(*args, **kwargs)
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile
-    self.finish_compile_and_init_runtime()
-  File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime
-    self._c_nn_graph.init_runtime()
-oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate
-    device->Alloc(options, &ptr, size)
-Error Type: oneflow.ErrorProto.runtime_error
-  File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator()
-
-Error Type: oneflow.ErrorProto.runtime_error
-
-[03/30 18:18:35] libai.engine.hooks INFO: Total training time: 0:00:10 (0:00:00 on hooks)
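[Editor's note: the MluDevice::Alloc failure above is raised while init_runtime() reserves device buffers for the compiled train graph, so a quick standalone allocation probe can help distinguish an out-of-memory condition from a broken MLU runtime before relaunching the 4-rank job. A minimal sketch, assuming the oneflow-cambricon build used in this patch series exposes the "mlu" device string, as the placements elsewhere in the diff do.]

import oneflow as flow

# Allocate and touch a small tensor on the MLU. Failure here points to the
# driver/runtime; success points toward the graph's buffers exceeding memory.
x = flow.ones(1024, 1024, device="mlu")
y = flow.matmul(x, x)
print(y.sum().to("cpu"))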
-[03/31 14:32:17] libai INFO: Rank of current process: 3. World size: 4
-[03/31 14:32:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:32:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
[... config contents identical to the 03/30 18:18:17 run above ...]
-[03/31 14:32:19] libai.engine.default INFO: Prepare training, validating, testing set
[... indexed-dataset and gpt_dataset loading logs identical to the 03/30 18:18:19 run above, apart from timestamps and load times ...]
-[03/31 14:32:20] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/31 14:32:20] libai INFO: > Start building model...
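[Editor's note: this run stops after "Start building model..." with no further output before the 14:40:48 relaunch. When iterating on failures like this, the config dumped at the top of every run can also be loaded and tweaked programmatically instead of edited on disk. A sketch, assuming LibAI exposes a detectron2-style LazyConfig loader under libai.config; the output path is a placeholder.]

from libai.config import LazyConfig

cfg = LazyConfig.load("projects/MagicPrompt/configs/gpt2_training.py")
cfg.graph.debug = 2                       # same knobs the log shows
cfg.train.input_placement_device = "cpu"  # keep inputs on host; the graph moves them to the MLU placement
LazyConfig.save(cfg, "/tmp/gpt2_training_mlu.yaml")  # placeholder path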
-[03/31 14:40:48] libai INFO: Rank of current process: 3. World size: 4
-[03/31 14:40:48] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:40:48] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
[... config contents identical to the 03/30 18:18:17 run above ...]
-[03/31 14:40:50] libai.engine.default INFO: Prepare training, validating, testing set
[... indexed-dataset and gpt_dataset loading logs identical to the 03/30 18:18:19 run above, apart from timestamps and load times ...]
-[03/31 14:40:50] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/31 14:40:50] libai INFO: > Start building model...
-[03/31 14:40:51] libai.engine.default INFO: Model:
-GPTForPreTraining(
[... module dump identical to the 03/30 18:18:25 run above ...]
-)
-[03/31 14:40:51] libai INFO: >>> done with building model. Building time: 0.843 seconds
-[03/31 14:40:51] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[03/31 14:40:51] libai.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/31 14:40:51] libai.engine.default INFO: Graph debug mode on, automatically output debug info.
-[03/31 14:40:51] libai.engine.trainer INFO: Starting training from iteration 0
-[03/31 14:40:52] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ...
-[03/31 14:41:01] libai.engine.trainer ERROR: Exception during training:
-Traceback (most recent call last):
[... traceback identical to the 03/30 18:18:35 failure above: MluDevice::Alloc error raised from memory_allocator.cpp line 50 during self._c_nn_graph.init_runtime() ...]
-[03/31 14:41:01] libai.engine.hooks INFO: Total training time: 0:00:10 (0:00:00 on hooks)
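[Editor's note: the same MluDevice::Alloc failure recurs here with an identical config, which suggests a genuine device-memory limit rather than a transient error. A hedged mitigation sketch in the config's own idiom; whether batch size 1 actually fits on the MLU is an assumption, and the values are illustrative only.]

# In projects/MagicPrompt/configs/gpt2_training.py, shrink per-rank memory:
train.update(
    dict(
        train_micro_batch_size=1,  # was 4; reduces per-MLU activation buffers
        test_micro_batch_size=1,   # was 4
    )
)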
-[03/31 14:54:05] libai INFO: Rank of current process: 3. World size: 4
-[03/31 14:54:05] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
-[03/31 14:54:06] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
[... config contents identical to the 03/30 18:18:17 run above ...]
-[03/31 14:54:08] libai.engine.default INFO: Prepare training, validating, testing set
[... indexed-dataset and gpt_dataset loading logs identical to the 03/30 18:18:19 run above, apart from timestamps and load times ...]
-[03/31 14:54:08] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[03/31 14:54:08] libai INFO: > Start building model...
-[03/31 14:54:09] libai.engine.default INFO: Model: -GPTForPreTraining( - (GPT_model): GPTModel( - (embeddings): GPTEmbedding( - (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) - (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) - (dropout): Dropout(p=0.1, inplace=False) - ) - (transformer): Transformer( - (layers): ModuleList( - (0): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (1): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (2): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (3): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): 
Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (4): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (5): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (6): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (7): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, 
inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (8): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (9): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (10): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (11): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, 
num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - ) - (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - ) - (lm_head): LMLogits() - ) - (loss_func): GPTLoss( - (lm_loss): ParallelCrossEntropyLoss() - ) -) -[03/31 14:54:09] libai INFO: >>> done with building model. Building time: 0.872 seconds -[03/31 14:54:09] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR -[03/31 14:54:09] libai.engine.default INFO: Graph debug mode on, automatically output debug info. -[03/31 14:54:09] libai.engine.default INFO: Graph debug mode on, automatically output debug info. -[03/31 14:54:09] libai.engine.trainer INFO: Starting training from iteration 0 -[03/31 14:54:10] libai.models.utils.graph_base INFO: Start compiling the train graph which may take some time. Please wait for a moment ... -[03/31 14:54:19] libai.engine.trainer ERROR: Exception during training: -Traceback (most recent call last): - File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train - self.run_step() - File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step - self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) - File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 339, in run_step - loss_dict = self.graph(**data) - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 243, in __call__ - self._compile(*args, **kwargs) - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 809, in _compile - self.finish_compile_and_init_runtime() - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/graph/graph.py", line 1166, in finish_compile_and_init_runtime - self._c_nn_graph.init_runtime() -oneflow._oneflow_internal.exception.RuntimeError: Error: MluDevice::Alloc error - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in Allocate - device->Alloc(options, &ptr, size) -Error Type: oneflow.ErrorProto.runtime_error - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/memory/memory_allocator.cpp", line 50, in operator() - -Error Type: oneflow.ErrorProto.runtime_error - -[03/31 14:54:19] libai.engine.hooks INFO: Total training time: 0:00:10 (0:00:00 on hooks) -[03/31 14:57:13] libai INFO: Rank of current process: 3. 
World size: 4 -[03/31 14:57:13] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[03/31 14:57:13] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - -graph.enabled=False -graph.debug = 2 - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = False -model.cfg.bias_dropout_fusion = False -model.cfg.scale_mask_softmax_fusion = False -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = False - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 -optim.params.clip_grad_max_norm = None -optim.params.clip_grad_norm_type = None - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=1, - amp=dict(enabled=False), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=4, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers = 12, - # custom_pipeline_stage_id = [0] * 6 + [1] * 6, - # pipeline_num_layers=model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/31 14:57:16] libai.engine.default INFO: Prepare training, validating, testing set -[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: building dataset index ... -[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file... -[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: reading sizes... -[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: reading pointers... -[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: reading document index... 
-[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file... -[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... -[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... -[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.006980 seconds -[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats: -[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718 -[03/31 14:57:16] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365 -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.014 seconds -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429 -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36 -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: total number of samples: 13536 -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: total number of epochs: 3 -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/31 14:57:16] libai.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/31 14:57:16] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 -[03/31 14:57:16] libai INFO: > Start building model... 
-[03/31 14:57:17] libai.engine.default INFO: Model: -GPTForPreTraining( - (GPT_model): GPTModel( - (embeddings): GPTEmbedding( - (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) - (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) - (dropout): Dropout(p=0.1, inplace=False) - ) - (transformer): Transformer( - (layers): ModuleList( - (0): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (1): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (2): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (3): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): 
Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (4): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (5): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (6): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (7): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, 
inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (8): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (9): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (10): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (11): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, 
num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - ) - (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - ) - (lm_head): LMLogits() - ) - (loss_func): GPTLoss( - (lm_loss): ParallelCrossEntropyLoss() - ) -) -[03/31 14:57:17] libai INFO: >>> done with building model. Building time: 0.885 seconds -[03/31 14:57:17] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR -[03/31 14:57:17] libai.engine.trainer INFO: Starting training from iteration 0 -[03/31 14:59:32] libai INFO: Rank of current process: 3. World size: 4 -[03/31 14:59:32] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[03/31 14:59:32] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - -graph.enabled=False -graph.debug = 2 - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = False -model.cfg.bias_dropout_fusion = False -model.cfg.scale_mask_softmax_fusion = False -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = False - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 -optim.params.clip_grad_max_norm = None -optim.params.clip_grad_norm_type = None - -train.update( - dict( - 
output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=1, - amp=dict(enabled=False), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=4, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers = 12, - # custom_pipeline_stage_id = [0] * 6 + [1] * 6, - # pipeline_num_layers=model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[03/31 14:59:35] libai.engine.default INFO: Prepare training, validating, testing set -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: building dataset index ... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: warming up index mmap file... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: reading sizes... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: reading pointers... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: reading document index... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: warming up data mmap file... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007580 seconds -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: indexed dataset stats: -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: number of documents: 73718 -[03/31 14:59:35] libai.data.data_utils.indexed_dataset INFO: number of sentences: 106365 -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_doc_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_sample_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_160000ns_1024sl_1234s_shuffle_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.009 seconds -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: total number of samples: 162429 -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: total number of epochs: 36 -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_doc_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_sample_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_12000ns_1024sl_1234s_shuffle_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/31 14:59:35] 
libai.data.datasets.gpt_dataset INFO: total number of samples: 13536 -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: total number of epochs: 3 -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_doc_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_sample_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_4000ns_1024sl_1234s_shuffle_idx.npy -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[03/31 14:59:35] libai.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[03/31 14:59:35] libai.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 -[03/31 14:59:35] libai INFO: > Start building model... -[03/31 14:59:36] libai.engine.default INFO: Model: -GPTForPreTraining( - (GPT_model): GPTModel( - (embeddings): GPTEmbedding( - (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) - (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) - (dropout): Dropout(p=0.1, inplace=False) - ) - (transformer): Transformer( - (layers): ModuleList( - (0): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (1): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (2): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): 
MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (3): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (4): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (5): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (6): TransformerLayer( - 
(drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (7): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (8): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (9): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, 
out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (10): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (11): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - ) - (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - ) - (lm_head): LMLogits() - ) - (loss_func): GPTLoss( - (lm_loss): ParallelCrossEntropyLoss() - ) -) -[03/31 14:59:36] libai INFO: >>> done with building model. 
Building time: 0.831 seconds -[03/31 14:59:36] libai.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR -[03/31 14:59:36] libai.engine.trainer INFO: Starting training from iteration 0 diff --git a/projects/MagicPrompt/oneflow_magicprompt/metrics.json b/projects/MagicPrompt/oneflow_magicprompt/metrics.json deleted file mode 100644 index 3ab61a6d4..000000000 --- a/projects/MagicPrompt/oneflow_magicprompt/metrics.json +++ /dev/null @@ -1,65 +0,0 @@ -{"data_time": 0.0014411650045076385, "iteration": 0, "lr": 5e-05, "total_loss": 10.86047077178955} -{"data_time": 0.0009524804991087876, "iteration": 1, "lr": 5e-05, "total_loss": 10.083935260772705} -{"data_time": 0.001286156999412924, "eta_seconds": 8649448.917897075, "iteration": 2, "lr": 5e-05, "time": 25.818865144989104, "total_loss": 9.440001487731934} -{"data_time": 0.0009059479998541065, "eta_seconds": 8575895.740720144, "iteration": 3, "lr": 5e-05, "time": 25.5993831139931, "total_loss": 9.373700618743896} -{"data_time": 0.0010666789894457906, "eta_seconds": 8631602.345258702, "iteration": 4, "lr": 5e-05, "time": 25.765746411998407, "total_loss": 9.30739974975586} -{"data_time": 0.0007962089948705398, "eta_seconds": 8618618.126283506, "iteration": 5, "lr": 5e-05, "time": 25.727064692997374, "total_loss": 9.212128639221191} -{"data_time": 0.0005257390002952889, "eta_seconds": 8619872.805872891, "iteration": 6, "lr": 5e-05, "time": 25.730886791003286, "total_loss": 9.116857528686523} -{"data_time": 0.000506085496454034, "eta_seconds": 8625686.061502784, "iteration": 7, "lr": 5e-05, "time": 25.748316601500846, "total_loss": 9.09308910369873} -{"data_time": 0.0005257390002952889, "eta_seconds": 8619821.34409931, "iteration": 8, "lr": 5e-05, "time": 25.730886791003286, "total_loss": 9.069320678710938} -{"data_time": 0.0006622090004384518, "eta_seconds": 8612676.266367672, "iteration": 9, "lr": 5e-05, "time": 25.709634882499813, "total_loss": 8.98242473602295} -{"data_time": 0.0007986790005816147, "eta_seconds": 8605531.231139852, "iteration": 10, "lr": 5e-05, "time": 25.68838297399634, "total_loss": 8.895528793334961} -{"data_time": 0.0007918894989416003, "eta_seconds": 8612624.847097907, "iteration": 11, "lr": 5e-05, "time": 25.709634882499813, "total_loss": 8.853058815002441} -{"data_time": 0.000785099997301586, "eta_seconds": 8617175.898946738, "iteration": 12, "lr": 5e-05, "time": 25.72329706099117, "total_loss": 8.810588836669922} -{"data_time": 0.0007462590001523495, "eta_seconds": 8618421.432657516, "iteration": 13, "lr": 5e-05, "time": 25.72709192599723, "total_loss": 8.789298057556152} -{"data_time": 0.0007268470071721822, "eta_seconds": 8619666.958778564, "iteration": 14, "lr": 5e-05, "time": 25.730886791003286, "total_loss": 8.768007278442383} -{"data_time": 0.0007467715040547773, "eta_seconds": 8618369.978473663, "iteration": 15, "lr": 5e-05, "time": 25.72709192599723, "total_loss": 8.735990524291992} -{"data_time": 0.0007666960009373724, "eta_seconds": 8619615.497004982, "iteration": 16, "lr": 5e-05, "time": 25.730886791003286, "total_loss": 8.703973770141602} -{"data_time": 0.0007619214957230724, "eta_seconds": 8622145.699786797, "iteration": 17, "lr": 5e-05, "time": 25.73851667150302, "total_loss": 8.69331693649292} -{"data_time": 0.002807431999826804, "iteration": 0, "lr": 5e-05, "total_loss": 10.86047077178955} -{"data_time": 0.0016498550030519255, "iteration": 1, "lr": 5e-05, "total_loss": 10.083935260772705} -{"data_time": 0.0005120069981785491, "eta_seconds": 8619044.69123013, "iteration": 2, 
"lr": 5e-05, "time": 25.72810761400615, "total_loss": 9.440001487731934} -{"data_time": 0.00180548700154759, "iteration": 0, "lr": 5e-05, "total_loss": 10.86047077178955} -{"data_time": 0.0014332204955280758, "iteration": 1, "lr": 5e-05, "total_loss": 10.083935260772705} -{"data_time": 0.0010609539895085618, "eta_seconds": 8602223.053338168, "iteration": 2, "lr": 5e-05, "time": 25.67789451900171, "total_loss": 9.440001487731934} -{"data_time": 0.0009005849933600985, "eta_seconds": 8574829.753975058, "iteration": 3, "lr": 5e-05, "time": 25.59620110200194, "total_loss": 9.373700618743896} -{"data_time": 0.0009029859938891605, "iteration": 0, "lr": 5e-05, "total_loss": 11.327921867370605} -{"data_time": 0.0016307824989780784, "iteration": 1, "lr": 5e-05, "total_loss": 11.342677593231201} -{"data_time": 0.0023585790040669963, "eta_seconds": 2862262.2889005477, "iteration": 2, "lr": 5e-05, "time": 6.408260321011767, "total_loss": 11.329483985900879} -{"data_time": 0.0017029235023073852, "eta_seconds": 2862870.7064581364, "iteration": 3, "lr": 5e-05, "time": 6.40963684500457, "total_loss": 11.343458652496338} -{"data_time": 0.0010472680005477741, "eta_seconds": 2863143.539721141, "iteration": 4, "lr": 5e-05, "time": 6.41026203900401, "total_loss": 11.35576343536377} -{"data_time": 0.0011308644970995374, "eta_seconds": 2863256.0381426434, "iteration": 5, "lr": 5e-05, "time": 6.41052826300438, "total_loss": 11.345033168792725} -{"data_time": 0.0012144609936513007, "eta_seconds": 2863130.7191970632, "iteration": 6, "lr": 5e-05, "time": 6.41026203900401, "total_loss": 11.338150978088379} -{"data_time": 0.0011308644970995374, "eta_seconds": 2863055.6510274047, "iteration": 7, "lr": 5e-05, "time": 6.4101083205023315, "total_loss": 11.340061664581299} -{"data_time": 0.001176661011413671, "iteration": 0, "lr": 5e-05, "total_loss": 11.327921867370605} -{"data_time": 0.0011061955010518432, "iteration": 1, "lr": 5e-05, "total_loss": 11.342677593231201} -{"data_time": 0.0011234899866394699, "eta_seconds": 2856998.9962181174, "iteration": 2, "lr": 5e-05, "time": 6.396476442998392, "total_loss": 11.329483985900879} -{"data_time": 0.0011500754990265705, "eta_seconds": 2858118.5685008415, "iteration": 3, "lr": 5e-05, "time": 6.398997356998734, "total_loss": 11.343458652496338} -{"data_time": 0.001176661011413671, "eta_seconds": 2859238.1357417377, "iteration": 4, "lr": 5e-05, "time": 6.401518270999077, "total_loss": 11.35576343536377} -{"data_time": 0.0012264845063327812, "eta_seconds": 2858105.7705061277, "iteration": 5, "lr": 5e-05, "time": 6.398997356998734, "total_loss": 11.345033168792725} -{"data_time": 0.001176661011413671, "eta_seconds": 2859225.3327051955, "iteration": 6, "lr": 5e-05, "time": 6.401518270999077, "total_loss": 11.338150978088379} -{"data_time": 0.0011500754990265705, "eta_seconds": 2858820.9874694482, "iteration": 7, "lr": 5e-05, "time": 6.400627312999859, "total_loss": 11.340061664581299} -{"data_time": 0.001176661011413671, "eta_seconds": 2858416.6440156163, "iteration": 8, "lr": 5e-05, "time": 6.399736355000641, "total_loss": 11.338150978088379} -{"data_time": 0.0012264845063327812, "eta_seconds": 2858106.114542271, "iteration": 9, "lr": 5e-05, "time": 6.399055434500042, "total_loss": 11.340061664581299} -{"data_time": 0.0012763080012518913, "eta_seconds": 2857795.586430767, "iteration": 10, "lr": 5e-05, "time": 6.398374513999443, "total_loss": 11.338150978088379} -{"data_time": 0.001407306503097061, "eta_seconds": 2858046.0265380414, "iteration": 11, "lr": 5e-05, "time": 
6.398949555994477, "total_loss": 11.33622694015503} -{"data_time": 0.0015383050049422309, "eta_seconds": 2857782.789681739, "iteration": 12, "lr": 5e-05, "time": 6.398374513999443, "total_loss": 11.33430290222168} -{"data_time": 0.001407306503097061, "eta_seconds": 2858033.2286389293, "iteration": 13, "lr": 5e-05, "time": 6.398949555994477, "total_loss": 11.332839012145996} -{"data_time": 0.0012763080012518913, "eta_seconds": 2858144.8762013507, "iteration": 14, "lr": 5e-05, "time": 6.399213855009293, "total_loss": 11.331375122070312} -{"data_time": 0.0012507934952736832, "eta_seconds": 2857951.0357728465, "iteration": 15, "lr": 5e-05, "time": 6.398794184504368, "total_loss": 11.332839012145996} -{"data_time": 0.0012308660079725087, "eta_seconds": 2858132.077773641, "iteration": 16, "lr": 5e-05, "time": 6.399213855009293, "total_loss": 11.331375122070312} -{"data_time": 0.0012535870046122, "eta_seconds": 2857938.238184477, "iteration": 17, "lr": 5e-05, "time": 6.398794184504368, "total_loss": 11.330994129180908} -{"data_time": 0.0012763080012518913, "eta_seconds": 2858119.2793459306, "iteration": 18, "lr": 5e-05, "time": 6.399213855009293, "total_loss": 11.330613136291504} -{"data_time": 0.0012535870046122, "eta_seconds": 2858182.2744775605, "iteration": 19, "lr": 5e-05, "time": 6.399369226499402, "total_loss": 11.330994129180908} -{"data_time": 0.0012953120021848008, "eta_seconds": 2858106.4809182207, "iteration": 20, "lr": 5e-05, "time": 6.399213855009293, "total_loss": 11.330994129180908} -{"data_time": 0.0013658775060321204, "eta_seconds": 2858102.8979477035, "iteration": 21, "lr": 5e-05, "time": 6.399220160506957, "total_loss": 11.330048561096191} -{"data_time": 0.0014778720069443807, "eta_seconds": 2858099.3149645757, "iteration": 22, "lr": 5e-05, "time": 6.399375531997066, "total_loss": 11.326469898223877} -{"data_time": 0.0013658775060321204, "eta_seconds": 2858096.160288107, "iteration": 23, "lr": 5e-05, "time": 6.399233730502601, "total_loss": 11.321541786193848} -{"data_time": 0.0013658775060321204, "eta_seconds": 2858086.516511644, "iteration": 24, "lr": 5e-05, "time": 6.399220160506957, "total_loss": 11.321541786193848} -{"data_time": 0.0013658775060321204, "eta_seconds": 2858077.3010670617, "iteration": 25, "lr": 5e-05, "time": 6.399220160506957, "total_loss": 11.320509910583496} -{"data_time": 0.0015193385042948648, "eta_seconds": 2858068.0856350907, "iteration": 26, "lr": 5e-05, "time": 6.398848810007621, "total_loss": 11.320509910583496} -{"data_time": 0.001564129997859709, "eta_seconds": 2857984.682117961, "iteration": 27, "lr": 5e-05, "time": 6.3986763970024185, "total_loss": 11.316842079162598} -{"data_time": 0.001564129997859709, "eta_seconds": 2857901.2789456574, "iteration": 28, "lr": 5e-05, "time": 6.3986763970024185, "total_loss": 11.313267707824707} -{"data_time": 0.0014622305025113747, "eta_seconds": 2857808.845811205, "iteration": 29, "lr": 5e-05, "time": 6.3986763970024185, "total_loss": 11.31224012374878} -{"data_time": 0.006361130013829097, "iteration": 0, "lr": 5e-05, "total_loss": 10.858524322509766} -{"data_time": 0.005785448011010885, "iteration": 0, "lr": 5e-05, "total_loss": 10.858524322509766} From a3c8a5e157bdbe531880e6a00e144f0585120b89 Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Mon, 10 Apr 2023 03:16:35 +0000 Subject: [PATCH 12/25] mock tokenizer --- .../mock_transformers/mock_tokenization.py | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 projects/mock_transformers/mock_tokenization.py diff --git
a/projects/mock_transformers/mock_tokenization.py b/projects/mock_transformers/mock_tokenization.py new file mode 100644 index 000000000..2b132067f --- /dev/null +++ b/projects/mock_transformers/mock_tokenization.py @@ -0,0 +1,109 @@ +import oneflow as flow +from libai.utils import distributed as dist +flow.mock_torch.enable() + +from transformers import BertTokenizer, GPT2Tokenizer, T5Tokenizer, MT5Tokenizer +from transformers.utils import generic +from transformers.utils.generic import TensorType +from transformers.tokenization_utils_base import * + + +# ---------------- mock TensorType ------------------ +class TensorType(ExplicitEnum): + PYTORCH = "pt" + TENSORFLOW = "tf" + ONEFLOW = "of" + NUMPY = "np" + JAX = "jax" + +generic.TensorType = TensorType + + +# ---------------- mock convert_to_tensors ------------------ +def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False): + if tensor_type is None: + return self + + # Convert to TensorType + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + as_tensor = None + is_tensor = None + # Get a function reference for the correct framework + if tensor_type == TensorType.TENSORFLOW: + if not is_tf_available(): + raise ImportError( + "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." + ) + import tensorflow as tf + + as_tensor = tf.constant + is_tensor = tf.is_tensor + elif tensor_type == TensorType.PYTORCH: + if not is_torch_available(): + raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") + import torch + + as_tensor = torch.tensor + is_tensor = torch.is_tensor + elif tensor_type == TensorType.ONEFLOW: + try: + import oneflow + except: + raise ImportError("Unable to convert output to OneFlow tensors format, OneFlow is not installed.") + as_tensor = flow.tensor + is_tensor = flow.is_tensor + elif tensor_type == TensorType.JAX: + if not is_flax_available(): + raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") + import jax.numpy as jnp # noqa: F811 + + as_tensor = jnp.array + is_tensor = is_jax_tensor + else: + as_tensor = np.asarray + is_tensor = is_numpy_array + + # Do the tensor conversion in batch + for key, value in self.items(): + try: + if prepend_batch_axis: + value = [value] + + if not is_tensor(value): + tensor = as_tensor(value) + + # Removing this for now in favor of controlling the shape with `prepend_batch_axis` + # # at-least2d + # if tensor.ndim > 2: + # tensor = tensor.squeeze(0) + # elif tensor.ndim < 2: + # tensor = tensor[None, :] + + self[key] = tensor + except Exception as e: + if key == "overflowing_tokens": + raise ValueError( + "Unable to create tensor returning overflowing tokens of different lengths. " + "Please see if a fast version of this tokenizer is available to have this feature available." + ) from e + raise ValueError( + "Unable to create tensor, you should probably activate truncation and/or padding with" + " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your" + f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is" + " expected)." 
+ ) from e + if os.getenv("IS_GLOBAL", True) == True: + size = self['input_ids'].size() + sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]) + placement = flow.placement("cuda", list(range(dist.get_world_size()))) + + for k, v in self.items(): + if is_tensor != flow.is_tensor: + raise ValueError("Unable to create tensor, you should probably set `return_tensors='of'` ") + if v.size() != size: + raise ValueError("Unable to create tensor, you should probably padding with `padding=True` ") + self[k] = v.to_global(sbp=sbp, placement=placement) + return self + +BatchEncoding.convert_to_tensors = flow_convert_to_tensors \ No newline at end of file From 846fba80ef16477cb0216a3e18ad73afdaf0440d Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Mon, 10 Apr 2023 07:05:32 +0000 Subject: [PATCH 13/25] compatible with Hugging Face tokenizers --- libai/engine/default.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libai/engine/default.py b/libai/engine/default.py index e90c9760e..14a107167 100644 --- a/libai/engine/default.py +++ b/libai/engine/default.py @@ -539,7 +539,8 @@ def build_tokenizer(cls, cfg): cfg.tokenization.make_vocab_size_divisible_by * cfg.train.dist.tensor_parallel_size ) - cfg.model.cfg.vocab_size = tokenizer.padded_vocab_size(multiple) + if hasattr(tokenizer, "padded_vocab_size"): + cfg.model.cfg.vocab_size = tokenizer.padded_vocab_size(multiple) return tokenizer @classmethod From 1d2ac19acdafba4ced9dd155f8a393de3c789c8b Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Mon, 10 Apr 2023 08:08:31 +0000 Subject: [PATCH 14/25] use mock_tokenizer in pipeline demo --- projects/MagicPrompt/configs/gpt2_inference.py | 5 ++--- projects/MagicPrompt/pipeline.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/projects/MagicPrompt/configs/gpt2_inference.py b/projects/MagicPrompt/configs/gpt2_inference.py index a78b51f78..87bd99eec 100644 --- a/projects/MagicPrompt/configs/gpt2_inference.py +++ b/projects/MagicPrompt/configs/gpt2_inference.py @@ -1,6 +1,6 @@ from configs.common.models.gpt import cfg from libai.config import LazyCall -from libai.tokenizer.tokenization_gpt2 import GPT2Tokenizer +from projects.mock_transformers import mock_tokenization from projects.MagicPrompt.gpt2 import GPTModel, GPTForPreTraining from configs.common.data.gpt_dataset import tokenization from configs.common.train import train @@ -62,8 +62,7 @@ model = LazyCall(GPTModel)(cfg=cfg) pretrain_model = LazyCall(GPTForPreTraining)(cfg=cfg) -tokenization.tokenizer = LazyCall(GPT2Tokenizer)( +tokenization.tokenizer = LazyCall(mock_tokenization.GPT2Tokenizer)( vocab_file="/data/home/magicprompt/vocab.json", merges_file="/data/home/magicprompt/merges.txt", - add_bos_token=True, ) diff --git a/projects/MagicPrompt/pipeline.py b/projects/MagicPrompt/pipeline.py index 1da8e5b4e..57d0a6afa 100644 --- a/projects/MagicPrompt/pipeline.py +++ b/projects/MagicPrompt/pipeline.py @@ -67,10 +67,9 @@ def _parse_parameters(self, **pipeline_parameters): def preprocess(self, inputs, **kwargs) -> dict: # tokenizer encoder - input_ids = self.tokenizer.encode(inputs, return_tensors="of", is_global=True) - + inputs = self.tokenizer(inputs, return_tensors="of", padding=True) inputs = { - "input_ids": input_ids, + "input_ids": inputs.input_ids, } return inputs From e69b70c3a649bda884ed4423574b87c8a4b9f83f Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Mon, 10 Apr 2023 12:25:16 +0000 Subject: [PATCH 15/25] support data parallel inference ---
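Note: this patch lifts the single-batch restriction on inference. The assert on `data_parallel_size == 1` in libai/inference/basic.py is dropped, and greedy_search() re-splits multi-prompt batches across the data-parallel ranks. Below is a minimal sketch of that re-split, using only APIs that already appear in this series (`flow.sbp.split`, `flow.sbp.broadcast`, `dist.get_nd_sbp`, `Tensor.to_global`); the helper name is illustrative and not part of the patch:

import oneflow as flow

from libai.utils import distributed as dist


def shard_batch_for_data_parallel(input_ids: flow.Tensor) -> flow.Tensor:
    # A batch with more than one prompt is split along dim 0 over the
    # data-parallel mesh axis, so each rank decodes only its own slice;
    # a single prompt stays broadcast and is decoded identically on
    # every rank (helper name is ours, mirroring the hunk below).
    if input_ids.size(0) > 1:
        input_ids = input_ids.to_global(
            sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast])
        )
    return input_ids

This mirrors the `if input_ids.size(0) > 1` branch added to greedy_search() in the hunks that follow; the pipeline demo is correspondingly extended from one prompt to a three-prompt batch (["a dog", "a cute pig", "a cute girl"]) so the split is actually exercised.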
.../tutorials/advanced_tutorials/customize_dataloader.md | 8 ++++---- libai/inference/basic.py | 3 --- libai/inference/generator/generation_utils.py | 5 ++++- projects/MagicPrompt/pipeline.py | 6 ++++-- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/source/tutorials/advanced_tutorials/customize_dataloader.md b/docs/source/tutorials/advanced_tutorials/customize_dataloader.md index 911fece7c..18ae2c3a6 100644 --- a/docs/source/tutorials/advanced_tutorials/customize_dataloader.md +++ b/docs/source/tutorials/advanced_tutorials/customize_dataloader.md @@ -57,15 +57,15 @@ def get_batch(cls, data, mixup_func=None): imgs, labels = data dist.synchronize() - imgs = imgs.to_global(spb=flow.sbp.broadcast, placement=flow.env.all_device_placement("cuda")) + imgs = imgs.to_global(sbp=flow.sbp.broadcast, placement=flow.env.all_device_placement("cuda")) imgs = imgs.to_global( - spb=dist.get_nd_sbp([flow.sbp.split(0), + sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]), placement=dist.get_layer_placement(0)) - labels = labels.to_global(spb=flow.sbp.broadcast, placement=flow.env.all_device_placement("cuda")) + labels = labels.to_global(sbp=flow.sbp.broadcast, placement=flow.env.all_device_placement("cuda")) labels = labels.to_global( - spb=dist.get_nd_sbp([flow.sbp.split(0), + sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]), placement=dist.get_layer_placement(-1)) return { diff --git a/libai/inference/basic.py b/libai/inference/basic.py index 4af16fd47..94d3f1781 100644 --- a/libai/inference/basic.py +++ b/libai/inference/basic.py @@ -61,9 +61,6 @@ def __init__( pipeline_num_layers, ) dist.setup_dist_util(self.cfg.train.dist) - assert ( - self.cfg.train.dist.data_parallel_size == 1 - ), "not support data parallel yet, only support tensor and pipeline parallel" logger.info(self.cfg.train.dist) # initial and load model diff --git a/libai/inference/generator/generation_utils.py b/libai/inference/generator/generation_utils.py index 73df306b6..d39182cb6 100644 --- a/libai/inference/generator/generation_utils.py +++ b/libai/inference/generator/generation_utils.py @@ -478,6 +478,9 @@ def greedy_search( unfinished_sequences = flow.ones(input_ids.shape[0]) cur_len = input_ids.shape[-1] while True: + if input_ids.size(0) > 1: + input_ids = input_ids.to_global(sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast])) + # prepare model inputs model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) @@ -496,7 +499,7 @@ def greedy_search( next_tokens = flow.argmax(next_token_scores, dim=-1) next_tokens = next_tokens.to_global(placement=input_ids.placement) unfinished_sequences = unfinished_sequences.to_global( - sbp=next_tokens.sbp, placement=next_tokens.placement + sbp=flow.sbp.broadcast, placement=next_tokens.placement ) if eos_token_id is not None: diff --git a/projects/MagicPrompt/pipeline.py b/projects/MagicPrompt/pipeline.py index 57d0a6afa..ebc6c2167 100644 --- a/projects/MagicPrompt/pipeline.py +++ b/projects/MagicPrompt/pipeline.py @@ -67,6 +67,8 @@ def _parse_parameters(self, **pipeline_parameters): def preprocess(self, inputs, **kwargs) -> dict: # tokenizer encoder + if self.tokenizer.pad_token == None: + self.tokenizer.pad_token = "[PAD]" inputs = self.tokenizer(inputs, return_tensors="of", padding=True) inputs = { "input_ids": inputs.input_ids, @@ -75,7 +77,7 @@ def preprocess(self, inputs, **kwargs) -> dict: return inputs def forward(self, inputs, **kwargs) -> dict: - outputs = self.model.generate(inputs["input_ids"], do_sample=True, 
max_length=50, **kwargs) + outputs = self.model.generate(inputs["input_ids"], max_length=50, **kwargs) return {"return_ids": outputs} def postprocess(self, model_output_dict, **kwargs) -> dict: @@ -99,7 +101,7 @@ def postprocess(self, model_output_dict, **kwargs) -> dict: mode="libai", ) - text = ["a dog"] + text = ["a dog", "a cute pig", "a cute girl"] output = pipeline(inputs=text) if dist.is_main_process(): print(output) From 5cce818389760fc108c9ac8522d4a5e46ed4538a Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Tue, 11 Apr 2023 07:36:40 +0000 Subject: [PATCH 16/25] reformat --- libai/inference/generator/generation_utils.py | 6 ++- .../mock_transformers/mock_tokenization.py | 51 +++++++++++++++---- 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/libai/inference/generator/generation_utils.py b/libai/inference/generator/generation_utils.py index d39182cb6..2463c2a6e 100644 --- a/libai/inference/generator/generation_utils.py +++ b/libai/inference/generator/generation_utils.py @@ -478,8 +478,10 @@ def greedy_search( unfinished_sequences = flow.ones(input_ids.shape[0]) cur_len = input_ids.shape[-1] while True: - if input_ids.size(0) > 1: - input_ids = input_ids.to_global(sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast])) + if input_ids.size(0) > 1: + input_ids = input_ids.to_global( + sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]) + ) # prepare model inputs model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) diff --git a/projects/mock_transformers/mock_tokenization.py b/projects/mock_transformers/mock_tokenization.py index 2b132067f..46649dd46 100644 --- a/projects/mock_transformers/mock_tokenization.py +++ b/projects/mock_transformers/mock_tokenization.py @@ -1,11 +1,28 @@ +# coding=utf-8 +# Copyright 2021 The OneFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import oneflow as flow + from libai.utils import distributed as dist + flow.mock_torch.enable() -from transformers import BertTokenizer, GPT2Tokenizer, T5Tokenizer, MT5Tokenizer +from transformers import BertTokenizer, GPT2Tokenizer, MT5Tokenizer, T5Tokenizer +from transformers.tokenization_utils_base import * from transformers.utils import generic from transformers.utils.generic import TensorType -from transformers.tokenization_utils_base import * # ---------------- mock TensorType ------------------ @@ -16,13 +33,14 @@ class TensorType(ExplicitEnum): NUMPY = "np" JAX = "jax" + generic.TensorType = TensorType # ---------------- mock convert_to_tensors ------------------ def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False): if tensor_type is None: - return self + return self # Convert to TensorType if not isinstance(tensor_type, TensorType): @@ -41,7 +59,9 @@ def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False): is_tensor = tf.is_tensor elif tensor_type == TensorType.PYTORCH: if not is_torch_available(): - raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") + raise ImportError( + "Unable to convert output to PyTorch tensors format, PyTorch is not installed." + ) import torch as_tensor = torch.tensor @@ -50,12 +70,16 @@ def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False): try: import oneflow except: - raise ImportError("Unable to convert output to OneFlow tensors format, OneFlow is not installed.") + raise ImportError( + "Unable to convert output to OneFlow tensors format, OneFlow is not installed." + ) as_tensor = flow.tensor is_tensor = flow.is_tensor elif tensor_type == TensorType.JAX: if not is_flax_available(): - raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") + raise ImportError( + "Unable to convert output to JAX tensors format, JAX is not installed." + ) import jax.numpy as jnp # noqa: F811 as_tensor = jnp.array @@ -94,16 +118,21 @@ def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False): " expected)." 
) from e if os.getenv("IS_GLOBAL", True) == True: - size = self['input_ids'].size() + size = self["input_ids"].size() sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]) placement = flow.placement("cuda", list(range(dist.get_world_size()))) - + for k, v in self.items(): if is_tensor != flow.is_tensor: - raise ValueError("Unable to create tensor, you should probably set `return_tensors='of'` ") + raise ValueError( + "Unable to create tensor, you should probably set `return_tensors='of'` " + ) if v.size() != size: - raise ValueError("Unable to create tensor, you should probably padding with `padding=True` ") + raise ValueError( + "Unable to create tensor, you should probably padding with `padding=True` " + ) self[k] = v.to_global(sbp=sbp, placement=placement) return self -BatchEncoding.convert_to_tensors = flow_convert_to_tensors \ No newline at end of file + +BatchEncoding.convert_to_tensors = flow_convert_to_tensors From 23b0422896ad9998b7af3f78a213f968cd4c9019 Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Tue, 11 Apr 2023 09:58:24 +0000 Subject: [PATCH 17/25] refine --- libai/inference/generator/generation_utils.py | 19 ++++++++++--------- .../mock_transformers/mock_tokenization.py | 3 +-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/libai/inference/generator/generation_utils.py b/libai/inference/generator/generation_utils.py index 2463c2a6e..2481a02fa 100644 --- a/libai/inference/generator/generation_utils.py +++ b/libai/inference/generator/generation_utils.py @@ -101,7 +101,7 @@ def _prepare_input_ids_for_generation( shape, dtype=flow.long, sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), - placement=flow.placement("cuda", list(range(dist.get_world_size()))), + placement=dist.get_layer_placement(0), ) * -100 ) @@ -113,7 +113,7 @@ def _prepare_input_ids_for_generation( (1, 1), dtype=flow.long, sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), - placement=flow.placement("cuda", list(range(dist.get_world_size()))), + placement=dist.get_layer_placement(0), ) * bos_token_id ) @@ -137,7 +137,7 @@ def _prepare_attention_mask_for_generation( inputs.shape[:2], dtype=flow.bool, sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), - placement=flow.placement("cuda", list(range(dist.get_world_size()))), + placement=dist.get_layer_placement(0), ) def _prepare_encoder_decoder_kwargs_for_generation( @@ -171,7 +171,7 @@ def _prepare_decoder_input_ids_for_generation( (batch_size, 1), dtype=flow.long, sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), - placement=flow.placement("cuda", list(range(dist.get_world_size()))), + placement=dist.get_layer_placement(0), ) * decoder_start_token_id ) @@ -202,7 +202,7 @@ def _expand_inputs_for_generation( ) expanded_return_idx = expanded_return_idx.to_global( sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), - placement=flow.placement("cuda", list(range(dist.get_world_size()))), + placement=dist.get_layer_placement(0), ) input_ids = input_ids.index_select(0, expanded_return_idx) @@ -501,7 +501,8 @@ def greedy_search( next_tokens = flow.argmax(next_token_scores, dim=-1) next_tokens = next_tokens.to_global(placement=input_ids.placement) unfinished_sequences = unfinished_sequences.to_global( - sbp=flow.sbp.broadcast, placement=next_tokens.placement + sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), + placement=dist.get_layer_placement(0), ) if eos_token_id is not None: @@ -594,12 +595,12 @@ def multinomial_sample( probs = 
nn.functional.softmax(next_token_scores, dim=-1) probs = probs.to_global( sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), - placement=flow.placement("cuda", list(range(dist.get_world_size()))), + placement=dist.get_layer_placement(0), ).to_local() next_tokens = flow.multinomial(probs, num_samples=1).squeeze(1) next_tokens = next_tokens.to_global( sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), - placement=flow.placement("cuda", list(range(dist.get_world_size()))), + placement=dist.get_layer_placement(0), ) unfinished_sequences = unfinished_sequences.to_global( sbp=next_tokens.sbp, placement=next_tokens.placement @@ -692,7 +693,7 @@ def beam_search( (batch_size, num_beams), dtype=flow.float, sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), - placement=flow.placement("cuda", list(range(dist.get_world_size()))), + placement=dist.get_layer_placement(0), ) beam_scores[:, 1:] = -1e9 beam_scores = beam_scores.view((batch_size * num_beams,)) diff --git a/projects/mock_transformers/mock_tokenization.py b/projects/mock_transformers/mock_tokenization.py index 46649dd46..8c05acc32 100644 --- a/projects/mock_transformers/mock_tokenization.py +++ b/projects/mock_transformers/mock_tokenization.py @@ -120,7 +120,6 @@ def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False): if os.getenv("IS_GLOBAL", True) == True: size = self["input_ids"].size() sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]) - placement = flow.placement("cuda", list(range(dist.get_world_size()))) for k, v in self.items(): if is_tensor != flow.is_tensor: @@ -131,7 +130,7 @@ def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False): raise ValueError( "Unable to create tensor, you should probably padding with `padding=True` " ) - self[k] = v.to_global(sbp=sbp, placement=placement) + self[k] = v.to_global(sbp=sbp, placement=dist.get_layer_placement(0)) return self From 347d6d1fbadd6455878da9cb1dba2e0bc60a60b3 Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Tue, 11 Apr 2023 10:34:12 +0000 Subject: [PATCH 18/25] refine --- .../mock_transformers/mock_tokenization.py | 51 ++++++++++--------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/projects/mock_transformers/mock_tokenization.py b/projects/mock_transformers/mock_tokenization.py index 8c05acc32..2902bd4fd 100644 --- a/projects/mock_transformers/mock_tokenization.py +++ b/projects/mock_transformers/mock_tokenization.py @@ -13,20 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os + import oneflow as flow from libai.utils import distributed as dist flow.mock_torch.enable() -from transformers import BertTokenizer, GPT2Tokenizer, MT5Tokenizer, T5Tokenizer -from transformers.tokenization_utils_base import * -from transformers.utils import generic -from transformers.utils.generic import TensorType +from transformers import BertTokenizer, GPT2Tokenizer, MT5Tokenizer, T5Tokenizer # noqa +from transformers.tokenization_utils_base import * # noqa +from transformers.utils import generic # noqa +from transformers.utils.generic import TensorType # noqa # ---------------- mock TensorType ------------------ -class TensorType(ExplicitEnum): +class TensorType(ExplicitEnum): # noqa PYTORCH = "pt" TENSORFLOW = "tf" ONEFLOW = "of" @@ -49,16 +51,17 @@ def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False): is_tensor = None # Get a function reference for the correct framework if tensor_type == TensorType.TENSORFLOW: - if not is_tf_available(): + if not is_tf_available(): # noqa raise ImportError( - "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." + "Unable to convert output to TensorFlow tensors format, TensorFlow is not " + "installed." ) import tensorflow as tf as_tensor = tf.constant is_tensor = tf.is_tensor elif tensor_type == TensorType.PYTORCH: - if not is_torch_available(): + if not is_torch_available(): # noqa raise ImportError( "Unable to convert output to PyTorch tensors format, PyTorch is not installed." ) @@ -68,25 +71,24 @@ def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False): is_tensor = torch.is_tensor elif tensor_type == TensorType.ONEFLOW: try: - import oneflow - except: - raise ImportError( - "Unable to convert output to OneFlow tensors format, OneFlow is not installed." - ) + import oneflow # noqa + except ImportError as e: + msg = "Unable to convert output to OneFlow tensors format, OneFlow is not installed." + raise ImportError(msg) from e as_tensor = flow.tensor is_tensor = flow.is_tensor elif tensor_type == TensorType.JAX: - if not is_flax_available(): + if not is_flax_available(): # noqa raise ImportError( "Unable to convert output to JAX tensors format, JAX is not installed." ) import jax.numpy as jnp # noqa: F811 as_tensor = jnp.array - is_tensor = is_jax_tensor + is_tensor = is_jax_tensor # noqa else: - as_tensor = np.asarray - is_tensor = is_numpy_array + as_tensor = np.asarray # noqa + is_tensor = is_numpy_array # noqa # Do the tensor conversion in batch for key, value in self.items(): @@ -109,15 +111,16 @@ def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False): if key == "overflowing_tokens": raise ValueError( "Unable to create tensor returning overflowing tokens of different lengths. " - "Please see if a fast version of this tokenizer is available to have this feature available." + "Please see if a fast version of this tokenizer is available to have this " + "feature available." ) from e raise ValueError( - "Unable to create tensor, you should probably activate truncation and/or padding with" - " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your" - f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is" - " expected)." + "Unable to create tensor, you should probably activate truncation and/or " + "padding with 'padding=True' 'truncation=True' to have batched tensors with " + f"the same length. 
Perhaps your features (`{key}` in this case) have " + "excessive nesting (inputs type `list` where type `int` is expected)." ) from e - if os.getenv("IS_GLOBAL", True) == True: + if os.getenv("IS_GLOBAL", True) is True: size = self["input_ids"].size() sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]) @@ -134,4 +137,4 @@ def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False): return self -BatchEncoding.convert_to_tensors = flow_convert_to_tensors +BatchEncoding.convert_to_tensors = flow_convert_to_tensors # noqa From 6df6fbc54ec192480609f01439f77bbf57d721ed Mon Sep 17 00:00:00 2001 From: xiezipeng-ML Date: Tue, 11 Apr 2023 10:49:59 +0000 Subject: [PATCH 19/25] reformat --- projects/MagicPrompt/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/MagicPrompt/pipeline.py b/projects/MagicPrompt/pipeline.py index ebc6c2167..56d1fa0d1 100644 --- a/projects/MagicPrompt/pipeline.py +++ b/projects/MagicPrompt/pipeline.py @@ -67,7 +67,7 @@ def _parse_parameters(self, **pipeline_parameters): def preprocess(self, inputs, **kwargs) -> dict: # tokenizer encoder - if self.tokenizer.pad_token == None: + if self.tokenizer.pad_token is None: self.tokenizer.pad_token = "[PAD]" inputs = self.tokenizer(inputs, return_tensors="of", padding=True) inputs = { From 2cfc20d6da35cd1def8f65c8e000ec243b509a13 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Mon, 17 Apr 2023 16:42:28 +0800 Subject: [PATCH 20/25] update flow.nn.functional.gelu --- libai/layers/activation.py | 3 +- .../MagicPrompt/configs/gpt2_inference.py | 2 + .../oneflow_magicprompt/config.yaml | 77 ++ .../events.out.tfevents.1681720413.oneflow-32 | 0 .../events.out.tfevents.1681720458.oneflow-32 | Bin 0 -> 68200 bytes .../MagicPrompt/oneflow_magicprompt/log.txt | 1085 +++++++++++++++++ .../oneflow_magicprompt/metrics.json | 325 +++++ projects/MagicPrompt/pipeline.py | 2 +- 8 files changed, 1491 insertions(+), 3 deletions(-) create mode 100644 projects/MagicPrompt/oneflow_magicprompt/config.yaml create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681720413.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681720458.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/log.txt create mode 100644 projects/MagicPrompt/oneflow_magicprompt/metrics.json diff --git a/libai/layers/activation.py b/libai/layers/activation.py index 277e4f057..636552fae 100644 --- a/libai/layers/activation.py +++ b/libai/layers/activation.py @@ -57,8 +57,7 @@ def forward(self, x: flow.Tensor) -> flow.Tensor: """When the approximate argument is 'tanh', Gelu is estimated with: 0.5 * x * (1.0 + flow.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * flow.pow(x, 3.0)))) """ - # return flow.nn.functional.gelu(x, approximate="tanh") - return 0.5 * x * (1.0 + flow.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * flow.pow(x, 3.0)))) + return flow.nn.functional.gelu(x, approximate="tanh") class QuickGELU(nn.Module): diff --git a/projects/MagicPrompt/configs/gpt2_inference.py b/projects/MagicPrompt/configs/gpt2_inference.py index 09f46e004..11dea455a 100644 --- a/projects/MagicPrompt/configs/gpt2_inference.py +++ b/projects/MagicPrompt/configs/gpt2_inference.py @@ -4,7 +4,9 @@ from projects.MagicPrompt.gpt2 import GPTModel, GPTForPreTraining from configs.common.data.gpt_dataset import tokenization from configs.common.train import train +from configs.common.models.graph import graph +graph.enabled=True cfg.update( # Model diff 
--git a/projects/MagicPrompt/oneflow_magicprompt/config.yaml b/projects/MagicPrompt/oneflow_magicprompt/config.yaml new file mode 100644 index 000000000..ac1d29051 --- /dev/null +++ b/projects/MagicPrompt/oneflow_magicprompt/config.yaml @@ -0,0 +1,77 @@ +dataloader: + train: + _target_: libai.data.build_nlp_train_val_test_loader + dataset: + - _target_: libai.data.datasets.GPT2Dataset + data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence + indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false} + max_seq_length: 1024 + name: gpt-2 + seed: 1234 + num_workers: 4 + splits: + - [949.0, 50.0, 1.0] + train_val_test_num_samples: null + weights: [1.0] +ds: + _target_: libai.data.datasets.GPT2Dataset + data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence + indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false} + max_seq_length: 1024 + name: gpt-2 + seed: 1234 +graph: + auto_parallel: {enable_auto_parallel_ignore_user_sbp_config: false, enabled: false, sbp_collector: false, trunk_algo: true} + debug: 2 + enabled: false + eval_graph: {_target_: libai.models.utils.GraphBase, is_train: false} + global_mode: {enabled: false} + train_graph: {_target_: libai.models.utils.GraphBase, is_train: true} +model: + _target_: projects.MagicPrompt.gpt2.GPTForPreTraining + cfg: {amp_enabled: false, apply_query_key_layer_scaling: true, apply_residual_post_layernorm: false, attention_dropout_prob: 0.1, bias_dropout_fusion: false, bias_gelu_fusion: false, bos_token_id: 50256, chunk_size_feed_forward: 0, decoder_start_token_id: null, diversity_penalty: 0.0, do_sample: false, early_stopping: false, embedding_dropout_prob: 0.1, encoder_no_repeat_ngram_size: 0, eos_token_id: 50256, exponential_decay_length_penalty: null, ffn_hidden_size: 3072, forced_bos_token_id: null, forced_eos_token_id: null, hidden_layers: 12, hidden_size: 768, initializer_range: 0.02, is_encoder_decoder: false, layernorm_epsilon: 1.0e-05, length_penalty: 1.0, max_length: 20, max_seq_length: 1024, min_length: 0, no_repeat_ngram_size: 0, num_attention_heads: 12, num_beam_groups: 1, num_beams: 1, num_return_sequences: 1, output_dropout_prob: 0.1, output_scores: false, pad_token_id: 0, pretrained_model_path: null, remove_invalid_values: false, repetition_penalty: 1.0, scale_mask_softmax_fusion: false, sep_token_id: null, temperature: 1.0, top_k: 50, top_p: 1.0, typical_p: 1.0, use_cache: true, use_scaled_init_for_output_weights: true, vocab_size: 50257} +optim: + _target_: oneflow.nn.optimizer.adamw.AdamW + betas: [0.9, 0.999] + do_bias_correction: true + eps: 1.0e-08 + lr: 5.0e-05 + params: {_target_: libai.optim.get_default_optimizer_params, clip_grad_max_norm: null, clip_grad_norm_type: null, weight_decay_bias: 0.0, weight_decay_norm: 0.0} + weight_decay: 0.01 +tokenization: + append_eod: false + make_vocab_size_divisible_by: 128 + tokenizer: {_target_: libai.tokenizer.GPT2Tokenizer, do_chinese_wwm: true, do_lower_case: true, merges_file: /home/zhangxiaoyu/magicprompt/merges.txt, vocab_file: /home/zhangxiaoyu/magicprompt/vocab.json} +train: + activation_checkpoint: {enabled: false} + amp: {enabled: false} + checkpointer: {max_to_keep: 20, period: 8000} + consumed_train_samples: 0 + consumed_valid_samples: 0 + dist: {data_parallel_size: 1, 
num_gpus_per_node: 1, num_nodes: 1, pipeline_num_layers: 10000, pipeline_parallel_size: 1, tensor_parallel_size: 1} + evaluation: + enabled: true + eval_iter: 250 + eval_period: 4000 + evaluator: {_target_: libai.evaluation.PPLEvaluator} + global_batch_size: 4 + input_placement_device: cpu + load_weight: '' + log_period: 1 + nccl_fusion_max_ops: 24 + nccl_fusion_threshold_mb: 16 + num_accumulation_steps: 1 + output_dir: projects/MagicPrompt/oneflow_magicprompt + rdma_enabled: false + resume: false + samples: 40000 + scheduler: {_target_: libai.scheduler.WarmupExponentialLR, gamma: 1.0, warmup_factor: 0.0, warmup_iter: 0.0, warmup_method: linear} + seed: 1234 + start_iter: 0 + test_micro_batch_size: 4 + train_epoch: 33 + train_iter: 10000 + train_micro_batch_size: 4 + train_samples: null + warmup_ratio: 0 + zero_optimization: {enabled: false, stage: 1} diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681720413.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681720413.oneflow-32 new file mode 100644 index 000000000..e69de29bb diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681720458.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681720458.oneflow-32 new file mode 100644 index 0000000000000000000000000000000000000000..26dee87e72ae2222ee52f685b43005f847c6be94 GIT binary patch literal 68200 zcmZwQbwCu~*9LIK1ZhRF1rrr4Kv`o4MO_sUgAO|oyAu!_Ma33D#l>#@+TEf^h@fCM zb}RNf%lzIs_nABX{oKQRJacDfXYS0}7LoqxBfHoe9}!O5yIbVCS2F82rf=8?r}h&D zju@wD+GNbec2)nE>(l5#P2=>AzYBBqtxYUVj0f}`*SGh$u;Bx3cAi+KiL7(ox7z>0 zxp%Fb8V7p*FI?8d%EV;c$Z>s#^&U2I>{y%W7aZMtJb8b){QuJ3`ktz1T(kTC(q&AH zO^k+(v8iyldyCc|SHxETU#fc-4_o7as8%%%8){ipxe2>3G}c&Ox;MJU|KbjBw|N^+ zoFT`R8{ryg=e}7Na@zKP`NIE)SCnTrBkA+bX|NexbEiq0VWiv)!~aA^;QFPo8D;b2 zxNG6TiVjUCv3+U44@dlxD<&%=Im{QnEXr_6z zaO4XRzM;5MIL)TKyR*IB9S{apk;}8f*L}Oz2H`=CUDD8kf7lS}prm9?#6ITAM z*QHMyJQ{?RO8k+8#g)RSD%0G0xIXCt!jBYJ3g5pk=WaK&+F}p}l$6V}!YV!m_d$4o zawI(O5v5)4{A(8oO-jiXIidEoUbi}`XBQA|%xzt(aI%z83TxGv=~lgx{sRaj^5nQu z=p9zp-TjQqD-gb+JS(jBvTsWecDf_iC842*XLC*s0pSKpa>6^m^t$`?2bBcj+rQH# zVM(R1bklimbAw)#0Ab@I5&s`o3imy>a@S@!&jsOm%Co||kA_YGVU414T@o64xS+^e z0fdt%$qE0x((4)x-1!fL^&C%0n_NmM44xk2w%f8wJrLffxKh}6ft7nm?M5CT?Bpbu zXN4!K=ky2RddiVd*DOj~u1?Bt5PqU0C!BXvud8VFG8KeZ`~H=LrIo_HR~EY6Iq~@< z2&eax<4U2Gvz7a&@>`aI@HORG;h}N4fgtSEU#?5SPK~3qsa0c_fp8rqIpK;7y)G;v zy$%Q~oNiOQa2^^fh55C%x^1cW?+FMk&&YA5(EXl;dqh_D2M|uBJS&X;(a;%$Hz-HK zIhrW#qNbnfg0S{kc?&pU%ssuX|8WN$2uID0l!PWqVT~E*-AqkgDuD3wJUOluj-F%g zp5WCd7KB6R%XwBfA%8{$2)|K|gy+S1IR9*APZ0XX$Q3zZq&N?=o((htVPE?*lCX?Y znBafGEpXo5e<18$Q;sWzx=Uv6cXv%U1K}CUv%(=|Pu2yYZ7sPj2@S{6RY{YKK^R6! 
[68200-byte base85 "GIT binary patch" payload for events.out.tfevents.1681720458.oneflow-32 omitted]
z`*~COjnK`bOq64m5Scz3M~nsLgqCK*3g_k;j#=8;PE+aEFbiM~Z%uh6^TW97Lf3~Q zt%2F74GmzK896UY0yCEh5_6>Z97?Y4bT%*-XsIU0d{|enGfy~O1emizyGqP>Rpyl$ zXK)2(KyS(`nT8{+r*6^~V4h+Iu*_>|*YVlva($>WG4FMV(2nff6i@aBF;R|rQDg?| z>^B4Ryk)G!T&v2gexf7kbE?Q}6Y-)dbaTsi)8U14Gf|b9lF@f5FfUJ_ zyc%=*-Ux5lX@e)y0G3%$zbsx0dc_2ZX_%YsFWfl-%zl%oCdbULsn_*BJO7V-%}y<-GBKM@ zj?lheInfw)+IS|)F)LNk>skj^EdXZDoCOk7uga|T#|M}6Tg;=p8gu%15B#~b-OK=% zsq^^r7?`f}sWLIEuM$rW{Fdzn=0PUPF((_kIiwN3NGGf1HHn$bF{h1k(!`zje-1nC z#a5J8V~$w24S$TlsWlB?nZ`{I{{ZGpCWy=rWq#r79kQ7y$IPp&*ByO1t08o=&*L&9 z3+Lu0Rp#KJ&EtUC>j~x6n9-SmMzGVKWCpOz)(+W&fNA}dDkC#9uxvNzW0?S8!7=;U>UI0(*scQRopRSC=2lf^z}~RIu+x?+PkAM? zq)DpK%x*yrbaOv5fMq(hZ-@(KoGVaeWCrEM;M<1xGEt6M-$t(+G@ compiling dataset index builder ... +[04/17 16:33:31] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.056 seconds +[04/17 16:33:31] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.057 seconds +[04/17 16:33:33] lb.engine.default INFO: Prepare training, validating, testing set +[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: building dataset index ... +[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... +[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: reading sizes... +[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: reading pointers... +[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: reading document index... +[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... +[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... +[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... +[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008427 seconds +[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: +[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 +[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 +[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy +[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy +[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy +[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds +[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 +[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 +[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy +[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy +[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy +[04/17 16:33:33] 
lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy
+[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy
+[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy
+[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[04/17 16:33:33] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[04/17 16:33:33] libai INFO: > Start building model...
+[04/17 16:33:33] lb.engine.default INFO: Model:
+GPTForPreTraining(
+  (GPT_model): GPTModel(
+    (embeddings): GPTEmbedding(
+      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
+      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
+      (dropout): Dropout(p=0.1, inplace=False)
+    )
+    (transformer): Transformer(
+      (layers): ModuleList(
+        (0): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        [... layers (1)-(11): 11 more TransformerLayer blocks, identical to (0) ...]
+      )
+      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+    )
+    (lm_head): LMLogits()
+  )
+  (loss_func): GPTLoss(
+    (lm_loss): ParallelCrossEntropyLoss()
+  )
+)
+[04/17 16:33:33] libai INFO: >>> done with building model. Building time: 0.152 seconds
+[04/17 16:33:33] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[04/17 16:33:33] lb.engine.trainer INFO: Starting training from iteration 0
+[04/17 16:33:34] lb.utils.events INFO: iteration: 0/335008 consumed_samples: 4 total_loss: 10.86 data_time: 0.0014 s/iter lr: 5.00e-05
+[04/17 16:33:35] lb.utils.events INFO: eta: 4 days, 6:13:20 iteration: 1/335008 consumed_samples: 8 total_loss: 10.08 data_time: 0.0008 s/iter lr: 5.00e-05
+[04/17 16:33:36] lb.utils.events INFO: eta: 4 days, 5:16:34 iteration: 2/335008 consumed_samples: 12 total_loss: 9.452 time: 1.0883 s/iter data_time: 0.0010 s/iter total_throughput: 3.68 samples/s lr: 5.00e-05
+[04/17 16:33:37] lb.utils.events INFO: eta: 4 days, 5:28:00 iteration: 3/335008 consumed_samples: 16 total_loss: 9.38 time: 1.0904 s/iter data_time: 0.0008 s/iter total_throughput: 3.67 samples/s lr: 5.00e-05
+[04/17 16:33:38] lb.utils.events INFO: eta: 4 days, 5:39:27 iteration: 4/335008 consumed_samples: 20 total_loss: 9.309 time: 1.0931 s/iter data_time: 0.0008 s/iter total_throughput: 3.66 samples/s lr: 5.00e-05
+[04/17 16:33:39] lb.utils.events INFO: eta: 4 days, 5:31:22 iteration: 5/335008 consumed_samples: 24 total_loss: 9.213 time: 1.0922 s/iter data_time: 0.0007 s/iter total_throughput: 3.66 samples/s lr: 5.00e-05
+[04/17 16:33:40] lb.utils.events INFO: eta: 4 days, 5:39:25 iteration: 6/335008 consumed_samples: 28 total_loss: 9.118 time: 1.0932 s/iter data_time: 0.0006 s/iter total_throughput: 3.66 samples/s lr: 5.00e-05
+[04/17 16:33:42] lb.utils.events INFO: eta: 4 days, 5:34:34 iteration: 7/335008 consumed_samples: 32 total_loss: 9.097 time: 1.0928 s/iter data_time: 0.0006 s/iter total_throughput: 3.66 samples/s lr: 5.00e-05
+[04/17 16:33:43] lb.utils.events INFO: eta: 4 days, 5:32:48 iteration: 8/335008 consumed_samples: 36 total_loss: 9.075 time: 1.0926 s/iter data_time: 0.0006 s/iter total_throughput: 3.66 samples/s lr: 5.00e-05
+[04/17 16:33:44] lb.utils.events INFO: eta: 4 days, 5:36:04 iteration: 9/335008 consumed_samples: 40 total_loss: 8.992 time: 1.0927 s/iter data_time: 0.0006 s/iter total_throughput: 3.66 samples/s lr: 5.00e-05
+[04/17 16:33:45] lb.utils.events INFO: eta: 4 days, 5:39:20 iteration: 10/335008 consumed_samples: 44 total_loss: 8.908 time: 1.0927 s/iter data_time: 0.0006 s/iter total_throughput: 3.66 samples/s lr: 5.00e-05
+[04/17 16:34:17] libai INFO: Rank of current process: 0. World size: 1
+[04/17 16:34:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[04/17 16:34:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+from libai.config import LazyCall
+from libai.evaluation import PPLEvaluator
+from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
+from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
+from configs.common.optim import optim
+
+from libai.scheduler import WarmupExponentialLR
+
+from configs.common.train import train
+from configs.common.models.graph import graph
+
+graph.enabled=False
+graph.debug = 2
+
+vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
+merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
+train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
+
+tokenization.tokenizer.vocab_file = vocab_file
+tokenization.tokenizer.merges_file = merge_files
+dataloader.train.dataset[0].data_prefix = train_data_prefix
+dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
+
+# gpt2 model config
+model.cfg.pretrained_model_path = None
+model.cfg.embedding_dropout_prob = 0.1
+model.cfg.attention_dropout_prob = 0.1
+model.cfg.output_dropout_prob = 0.1
+model.cfg.num_attention_heads = 12
+model.cfg.hidden_size = 768
+model.cfg.ffn_hidden_size = 4 * 768
+model.cfg.hidden_layers = 12
+model.cfg.max_seq_length = 1024
+model.cfg.initializer_range = 0.02
+model.cfg.vocab_size = 50257
+model.cfg.layernorm_epsilon = 1e-5
+model.cfg.use_scaled_init_for_output_weights = True
+model.cfg.bias_gelu_fusion = False
+model.cfg.bias_dropout_fusion = False
+model.cfg.scale_mask_softmax_fusion = False
+model.cfg.apply_query_key_layer_scaling = True
+model.cfg.apply_residual_post_layernorm = False
+model.cfg.amp_enabled = False
+
+train.input_placement_device = "cpu"
+
+train.dist.pipeline_num_layers = model.cfg.hidden_layers
+
+for ds in dataloader.train.dataset:
+    ds.max_seq_length = model.cfg.max_seq_length
+
+optim.lr = 5.0e-05
+optim.params.clip_grad_max_norm = None
+optim.params.clip_grad_norm_type = None
+
+train.update(
+    dict(
+        output_dir="projects/MagicPrompt/oneflow_magicprompt",
+        train_micro_batch_size=4,
+        test_micro_batch_size=4,
+        train_epoch=33,
+        train_iter=10000,
+        log_period=1,
+        amp=dict(enabled=False),
+        warmup_ratio=0,
+        checkpointer=dict(period=8000, max_to_keep=20),
+        dist=dict(
+            data_parallel_size=4,
+            tensor_parallel_size=1,
+            pipeline_parallel_size=1,
+            # pipeline_num_layers = 12,
+            # custom_pipeline_stage_id = [0] * 6 + [1] * 6,
+            # pipeline_num_layers=model.cfg.hidden_layers,
+        ),
+        scheduler=LazyCall(WarmupExponentialLR)(
+            warmup_factor=0.0,
+            gamma=1.0,
+            warmup_method="linear",
+            warmup_iter=0.0,
+        ),
+        evaluation=dict(
+            enabled=True,
+            evaluator=LazyCall(PPLEvaluator)(),
+            eval_iter=250,
+            eval_period=4000,
+        ),
+        rdma_enabled=False,
+    )
+)
+
+[04/17 16:34:17] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[04/17 16:34:17] lb.engine.default INFO: > compiling dataset index builder ...
+[04/17 16:34:17] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.054 seconds
+[04/17 16:34:17] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.055 seconds
+[04/17 16:34:18] lb.engine.default INFO: Prepare training, validating, testing set
+[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
+[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
+[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: reading sizes...
+[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: reading pointers...
+[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: reading document index...
+[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
+[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
+[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
+[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007285 seconds
+[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
+[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
+[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
+[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
+[04/17 16:34:18] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[04/17 16:34:18] libai INFO: > Start building model...
+[04/17 16:34:18] lb.engine.default INFO: Model:
+[... GPTForPreTraining module tree identical to the model printout at 16:33:33 above ...]
+[04/17 16:34:18] libai INFO: >>> done with building model. Building time: 0.146 seconds
+[04/17 16:34:18] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[04/17 16:34:18] lb.engine.trainer INFO: Starting training from iteration 0
+[04/17 16:34:19] lb.utils.events INFO: iteration: 0/335008 consumed_samples: 4 total_loss: 10.86 data_time: 0.0011 s/iter lr: 5.00e-05
+[04/17 16:34:20] lb.utils.events INFO: eta: 3 days, 21:22:14 iteration: 1/335008 consumed_samples: 8 total_loss: 10.08 data_time: 0.0007 s/iter lr: 5.00e-05
+[04/17 16:34:21] lb.utils.events INFO: eta: 3 days, 21:19:35 iteration: 2/335008 consumed_samples: 12 total_loss: 9.452 time: 1.0029 s/iter data_time: 0.0008 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05
+[04/17 16:34:22] lb.utils.events INFO: eta: 3 days, 21:17:20 iteration: 3/335008 consumed_samples: 16 total_loss: 9.38 time: 1.0025 s/iter data_time: 0.0007 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05
+[04/17 16:34:23] lb.utils.events INFO: eta: 3 days, 21:15:05 iteration: 4/335008 consumed_samples: 20 total_loss: 9.309 time: 1.0005 s/iter data_time: 0.0006 s/iter total_throughput: 4.00 samples/s lr: 5.00e-05
+[04/17 16:34:24] lb.utils.events INFO: eta: 3 days, 21:17:18 iteration: 5/335008 consumed_samples: 24 total_loss: 9.213 time: 1.0028 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05
+[04/17 16:34:25] lb.utils.events INFO: eta: 3 days, 21:15:03 iteration: 6/335008 consumed_samples: 28 total_loss: 9.118 time: 1.0022 s/iter data_time: 0.0006 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05
+[04/17 16:34:26] lb.utils.events INFO: eta: 3 days, 21:12:04 iteration: 7/335008 consumed_samples: 32 total_loss: 9.097 time: 1.0020 s/iter data_time: 0.0006 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05
+[04/17 16:34:27] lb.utils.events INFO: eta: 3 days, 21:11:32 iteration: 8/335008 consumed_samples: 36 total_loss: 9.075 time: 1.0019 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05
+[04/17 16:34:28] lb.utils.events INFO: eta: 3 days, 21:13:16 iteration: 9/335008 consumed_samples: 40 total_loss: 8.992 time: 1.0021 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05
+[04/17 16:34:29] lb.utils.events INFO: eta: 3 days, 21:14:59 iteration: 10/335008 consumed_samples: 44 total_loss: 8.908 time: 1.0029 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05
+[04/17 16:34:31] lb.utils.events INFO: eta: 3 days, 21:17:12 iteration: 11/335008 consumed_samples: 48 total_loss: 8.861 time: 1.0033 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05
+[... iterations 12-49 elided: total_loss falls steadily from 8.814 to 7.886 at ~1.00 s/iter (~3.99 samples/s), lr 5.00e-05 throughout ...]
+[04/17 16:35:10] lb.utils.events INFO: eta: 3 days, 21:15:10 iteration: 50/335008 consumed_samples: 204 total_loss: 7.854 time: 1.0021 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05
+[... iterations 51-99 elided: total_loss falls from 7.851 to 7.395; time drifts up from ~1.00 to ~1.14 s/iter around iteration 80 before settling back ...]
+[04/17 16:36:11] lb.utils.events INFO: eta: 3 days, 21:16:07 iteration: 100/335008 consumed_samples: 404 total_loss: 7.395 time: 1.1138 s/iter data_time: 0.0005 s/iter total_throughput: 3.59 samples/s lr: 5.00e-05
+[... iterations 101-149 elided: total_loss falls from 7.389 to 6.967 at ~1.07-1.11 s/iter ...]
+[04/17 16:37:01] lb.utils.events INFO: eta: 3 days, 21:05:35 iteration: 150/335008 consumed_samples: 604 total_loss: 6.951 time: 1.0743 s/iter data_time: 0.0012 s/iter total_throughput: 3.72 samples/s lr: 5.00e-05
+[... iterations 151-157 elided: total_loss falls from 6.947 to 6.897 ...]
+[04/17 16:37:09] lb.utils.events INFO: eta: 3 days, 21:03:58 iteration: 158/335008 consumed_samples: 636 total_loss: 6.895 time: 1.0703 s/iter data_time: 0.0009 s/iter total_throughput: 3.74 samples/s lr: 5.00e-05
+[04/17 16:37:10] lb.utils.events INFO: eta: 3 days, 21:03:54
iteration: 159/335008 consumed_samples: 640 total_loss: 6.886 time: 1.0698 s/iter data_time: 0.0010 s/iter total_throughput: 3.74 samples/s lr: 5.00e-05 +[04/17 16:37:11] lb.utils.events INFO: eta: 3 days, 21:03:49 iteration: 160/335008 consumed_samples: 644 total_loss: 6.877 time: 1.0694 s/iter data_time: 0.0011 s/iter total_throughput: 3.74 samples/s lr: 5.00e-05 +[04/17 16:37:12] lb.utils.events INFO: eta: 3 days, 21:03:40 iteration: 161/335008 consumed_samples: 648 total_loss: 6.848 time: 1.0689 s/iter data_time: 0.0011 s/iter total_throughput: 3.74 samples/s lr: 5.00e-05 +[04/17 16:37:13] lb.utils.events INFO: eta: 3 days, 21:03:30 iteration: 162/335008 consumed_samples: 652 total_loss: 6.82 time: 1.0685 s/iter data_time: 0.0012 s/iter total_throughput: 3.74 samples/s lr: 5.00e-05 +[04/17 16:37:14] lb.utils.events INFO: eta: 3 days, 21:03:38 iteration: 163/335008 consumed_samples: 656 total_loss: 6.817 time: 1.0681 s/iter data_time: 0.0012 s/iter total_throughput: 3.74 samples/s lr: 5.00e-05 +[04/17 16:37:15] lb.utils.events INFO: eta: 3 days, 21:03:28 iteration: 164/335008 consumed_samples: 660 total_loss: 6.815 time: 1.0676 s/iter data_time: 0.0012 s/iter total_throughput: 3.75 samples/s lr: 5.00e-05 +[04/17 16:37:16] lb.utils.events INFO: eta: 3 days, 21:03:25 iteration: 165/335008 consumed_samples: 664 total_loss: 6.812 time: 1.0672 s/iter data_time: 0.0011 s/iter total_throughput: 3.75 samples/s lr: 5.00e-05 +[04/17 16:37:17] lb.utils.events INFO: eta: 3 days, 21:03:23 iteration: 166/335008 consumed_samples: 668 total_loss: 6.809 time: 1.0667 s/iter data_time: 0.0010 s/iter total_throughput: 3.75 samples/s lr: 5.00e-05 +[04/17 16:37:18] lb.utils.events INFO: eta: 3 days, 21:03:23 iteration: 167/335008 consumed_samples: 672 total_loss: 6.809 time: 1.0664 s/iter data_time: 0.0010 s/iter total_throughput: 3.75 samples/s lr: 5.00e-05 +[04/17 16:37:19] lb.utils.events INFO: eta: 3 days, 21:03:21 iteration: 168/335008 consumed_samples: 676 total_loss: 6.808 time: 1.0659 s/iter data_time: 0.0009 s/iter total_throughput: 3.75 samples/s lr: 5.00e-05 +[04/17 16:37:20] lb.utils.events INFO: eta: 3 days, 21:03:10 iteration: 169/335008 consumed_samples: 680 total_loss: 6.808 time: 1.0655 s/iter data_time: 0.0008 s/iter total_throughput: 3.75 samples/s lr: 5.00e-05 +[04/17 16:37:21] lb.utils.events INFO: eta: 3 days, 21:02:58 iteration: 170/335008 consumed_samples: 684 total_loss: 6.807 time: 1.0651 s/iter data_time: 0.0008 s/iter total_throughput: 3.76 samples/s lr: 5.00e-05 +[04/17 16:37:22] lb.utils.events INFO: eta: 3 days, 21:03:08 iteration: 171/335008 consumed_samples: 688 total_loss: 6.804 time: 1.0647 s/iter data_time: 0.0008 s/iter total_throughput: 3.76 samples/s lr: 5.00e-05 +[04/17 16:37:23] lb.utils.events INFO: eta: 3 days, 21:02:56 iteration: 172/335008 consumed_samples: 692 total_loss: 6.801 time: 1.0644 s/iter data_time: 0.0008 s/iter total_throughput: 3.76 samples/s lr: 5.00e-05 +[04/17 16:37:24] lb.utils.events INFO: eta: 3 days, 21:02:51 iteration: 173/335008 consumed_samples: 696 total_loss: 6.796 time: 1.0640 s/iter data_time: 0.0008 s/iter total_throughput: 3.76 samples/s lr: 5.00e-05 +[04/17 16:37:25] lb.utils.events INFO: eta: 3 days, 21:02:54 iteration: 174/335008 consumed_samples: 700 total_loss: 6.792 time: 1.0636 s/iter data_time: 0.0008 s/iter total_throughput: 3.76 samples/s lr: 5.00e-05 +[04/17 16:37:26] lb.utils.events INFO: eta: 3 days, 21:03:04 iteration: 175/335008 consumed_samples: 704 total_loss: 6.792 time: 1.0633 s/iter data_time: 0.0008 s/iter 
total_throughput: 3.76 samples/s lr: 5.00e-05 +[04/17 16:37:27] lb.utils.events INFO: eta: 3 days, 21:03:13 iteration: 176/335008 consumed_samples: 708 total_loss: 6.791 time: 1.0629 s/iter data_time: 0.0008 s/iter total_throughput: 3.76 samples/s lr: 5.00e-05 +[04/17 16:37:28] lb.utils.events INFO: eta: 3 days, 21:03:02 iteration: 177/335008 consumed_samples: 712 total_loss: 6.787 time: 1.0625 s/iter data_time: 0.0008 s/iter total_throughput: 3.76 samples/s lr: 5.00e-05 +[04/17 16:37:29] lb.utils.events INFO: eta: 3 days, 21:02:50 iteration: 178/335008 consumed_samples: 716 total_loss: 6.782 time: 1.0622 s/iter data_time: 0.0008 s/iter total_throughput: 3.77 samples/s lr: 5.00e-05 +[04/17 16:37:30] lb.utils.events INFO: eta: 3 days, 21:03:00 iteration: 179/335008 consumed_samples: 720 total_loss: 6.782 time: 1.0618 s/iter data_time: 0.0007 s/iter total_throughput: 3.77 samples/s lr: 5.00e-05 +[04/17 16:37:31] lb.utils.events INFO: eta: 3 days, 21:02:48 iteration: 180/335008 consumed_samples: 724 total_loss: 6.781 time: 1.0615 s/iter data_time: 0.0007 s/iter total_throughput: 3.77 samples/s lr: 5.00e-05 +[04/17 16:37:32] lb.utils.events INFO: eta: 3 days, 21:02:43 iteration: 181/335008 consumed_samples: 728 total_loss: 6.778 time: 1.0611 s/iter data_time: 0.0006 s/iter total_throughput: 3.77 samples/s lr: 5.00e-05 +[04/17 16:37:33] lb.utils.events INFO: eta: 3 days, 21:02:37 iteration: 182/335008 consumed_samples: 732 total_loss: 6.774 time: 1.0608 s/iter data_time: 0.0006 s/iter total_throughput: 3.77 samples/s lr: 5.00e-05 +[04/17 16:37:34] lb.utils.events INFO: eta: 3 days, 21:02:26 iteration: 183/335008 consumed_samples: 736 total_loss: 6.771 time: 1.0604 s/iter data_time: 0.0006 s/iter total_throughput: 3.77 samples/s lr: 5.00e-05 +[04/17 16:37:35] lb.utils.events INFO: eta: 3 days, 21:02:16 iteration: 184/335008 consumed_samples: 740 total_loss: 6.768 time: 1.0601 s/iter data_time: 0.0005 s/iter total_throughput: 3.77 samples/s lr: 5.00e-05 +[04/17 16:37:36] lb.utils.events INFO: eta: 3 days, 21:02:11 iteration: 185/335008 consumed_samples: 744 total_loss: 6.766 time: 1.0597 s/iter data_time: 0.0006 s/iter total_throughput: 3.77 samples/s lr: 5.00e-05 +[04/17 16:37:37] lb.utils.events INFO: eta: 3 days, 21:02:06 iteration: 186/335008 consumed_samples: 748 total_loss: 6.764 time: 1.0593 s/iter data_time: 0.0006 s/iter total_throughput: 3.78 samples/s lr: 5.00e-05 +[04/17 16:37:38] lb.utils.events INFO: eta: 3 days, 21:01:57 iteration: 187/335008 consumed_samples: 752 total_loss: 6.762 time: 1.0590 s/iter data_time: 0.0007 s/iter total_throughput: 3.78 samples/s lr: 5.00e-05 +[04/17 16:37:39] lb.utils.events INFO: eta: 3 days, 21:01:49 iteration: 188/335008 consumed_samples: 756 total_loss: 6.76 time: 1.0586 s/iter data_time: 0.0007 s/iter total_throughput: 3.78 samples/s lr: 5.00e-05 +[04/17 16:37:40] lb.utils.events INFO: eta: 3 days, 21:01:38 iteration: 189/335008 consumed_samples: 760 total_loss: 6.758 time: 1.0583 s/iter data_time: 0.0007 s/iter total_throughput: 3.78 samples/s lr: 5.00e-05 +[04/17 16:37:41] lb.utils.events INFO: eta: 3 days, 21:01:27 iteration: 190/335008 consumed_samples: 764 total_loss: 6.755 time: 1.0579 s/iter data_time: 0.0007 s/iter total_throughput: 3.78 samples/s lr: 5.00e-05 +[04/17 16:37:42] lb.utils.events INFO: eta: 3 days, 21:01:02 iteration: 191/335008 consumed_samples: 768 total_loss: 6.753 time: 1.0576 s/iter data_time: 0.0007 s/iter total_throughput: 3.78 samples/s lr: 5.00e-05 +[04/17 16:37:43] lb.utils.events INFO: eta: 3 days, 21:00:36 
iteration: 192/335008 consumed_samples: 772 total_loss: 6.751 time: 1.0573 s/iter data_time: 0.0009 s/iter total_throughput: 3.78 samples/s lr: 5.00e-05 +[04/17 16:37:44] lb.utils.events INFO: eta: 3 days, 21:00:26 iteration: 193/335008 consumed_samples: 776 total_loss: 6.746 time: 1.0569 s/iter data_time: 0.0009 s/iter total_throughput: 3.78 samples/s lr: 5.00e-05 +[04/17 16:37:45] lb.utils.events INFO: eta: 3 days, 21:00:16 iteration: 194/335008 consumed_samples: 780 total_loss: 6.741 time: 1.0566 s/iter data_time: 0.0009 s/iter total_throughput: 3.79 samples/s lr: 5.00e-05 +[04/17 16:37:46] lb.utils.events INFO: eta: 3 days, 21:00:09 iteration: 195/335008 consumed_samples: 784 total_loss: 6.738 time: 1.0563 s/iter data_time: 0.0009 s/iter total_throughput: 3.79 samples/s lr: 5.00e-05 +[04/17 16:37:47] lb.utils.events INFO: eta: 3 days, 21:00:02 iteration: 196/335008 consumed_samples: 788 total_loss: 6.735 time: 1.0559 s/iter data_time: 0.0009 s/iter total_throughput: 3.79 samples/s lr: 5.00e-05 +[04/17 16:37:48] lb.utils.events INFO: eta: 3 days, 20:59:56 iteration: 197/335008 consumed_samples: 792 total_loss: 6.73 time: 1.0556 s/iter data_time: 0.0009 s/iter total_throughput: 3.79 samples/s lr: 5.00e-05 +[04/17 16:37:49] lb.utils.events INFO: eta: 3 days, 20:59:50 iteration: 198/335008 consumed_samples: 796 total_loss: 6.726 time: 1.0553 s/iter data_time: 0.0009 s/iter total_throughput: 3.79 samples/s lr: 5.00e-05 +[04/17 16:37:50] lb.utils.events INFO: eta: 3 days, 20:59:48 iteration: 199/335008 consumed_samples: 800 total_loss: 6.724 time: 1.0550 s/iter data_time: 0.0009 s/iter total_throughput: 3.79 samples/s lr: 5.00e-05 +[04/17 16:37:51] lb.utils.events INFO: eta: 3 days, 20:59:47 iteration: 200/335008 consumed_samples: 804 total_loss: 6.719 time: 1.0547 s/iter data_time: 0.0009 s/iter total_throughput: 3.79 samples/s lr: 5.00e-05 +[04/17 16:37:52] lb.utils.events INFO: eta: 3 days, 20:59:34 iteration: 201/335008 consumed_samples: 808 total_loss: 6.716 time: 1.0544 s/iter data_time: 0.0009 s/iter total_throughput: 3.79 samples/s lr: 5.00e-05 +[04/17 16:37:53] lb.utils.events INFO: eta: 3 days, 20:59:22 iteration: 202/335008 consumed_samples: 812 total_loss: 6.715 time: 1.0541 s/iter data_time: 0.0010 s/iter total_throughput: 3.79 samples/s lr: 5.00e-05 +[04/17 16:37:54] lb.utils.events INFO: eta: 3 days, 20:58:52 iteration: 203/335008 consumed_samples: 816 total_loss: 6.713 time: 1.0537 s/iter data_time: 0.0010 s/iter total_throughput: 3.80 samples/s lr: 5.00e-05 +[04/17 16:37:55] lb.utils.events INFO: eta: 3 days, 20:58:22 iteration: 204/335008 consumed_samples: 820 total_loss: 6.712 time: 1.0535 s/iter data_time: 0.0010 s/iter total_throughput: 3.80 samples/s lr: 5.00e-05 +[04/17 16:37:56] lb.utils.events INFO: eta: 3 days, 20:58:10 iteration: 205/335008 consumed_samples: 824 total_loss: 6.706 time: 1.0532 s/iter data_time: 0.0011 s/iter total_throughput: 3.80 samples/s lr: 5.00e-05 +[04/17 16:37:57] lb.utils.events INFO: eta: 3 days, 20:57:59 iteration: 206/335008 consumed_samples: 828 total_loss: 6.698 time: 1.0529 s/iter data_time: 0.0011 s/iter total_throughput: 3.80 samples/s lr: 5.00e-05 +[04/17 16:37:58] lb.utils.events INFO: eta: 3 days, 20:57:25 iteration: 207/335008 consumed_samples: 832 total_loss: 6.695 time: 1.0526 s/iter data_time: 0.0010 s/iter total_throughput: 3.80 samples/s lr: 5.00e-05 +[04/17 16:37:59] lb.utils.events INFO: eta: 3 days, 20:56:52 iteration: 208/335008 consumed_samples: 836 total_loss: 6.692 time: 1.0523 s/iter data_time: 0.0010 s/iter 
total_throughput: 3.80 samples/s lr: 5.00e-05 +[04/17 16:38:00] lb.utils.events INFO: eta: 3 days, 20:56:57 iteration: 209/335008 consumed_samples: 840 total_loss: 6.687 time: 1.0521 s/iter data_time: 0.0011 s/iter total_throughput: 3.80 samples/s lr: 5.00e-05 +[04/17 16:38:01] lb.utils.events INFO: eta: 3 days, 20:56:50 iteration: 210/335008 consumed_samples: 844 total_loss: 6.683 time: 1.0518 s/iter data_time: 0.0010 s/iter total_throughput: 3.80 samples/s lr: 5.00e-05 +[04/17 16:38:02] lb.utils.events INFO: eta: 3 days, 20:56:38 iteration: 211/335008 consumed_samples: 848 total_loss: 6.682 time: 1.0515 s/iter data_time: 0.0010 s/iter total_throughput: 3.80 samples/s lr: 5.00e-05 +[04/17 16:38:03] lb.utils.events INFO: eta: 3 days, 20:56:26 iteration: 212/335008 consumed_samples: 852 total_loss: 6.681 time: 1.0512 s/iter data_time: 0.0009 s/iter total_throughput: 3.81 samples/s lr: 5.00e-05 +[04/17 16:38:04] lb.utils.events INFO: eta: 3 days, 20:55:44 iteration: 213/335008 consumed_samples: 856 total_loss: 6.676 time: 1.0510 s/iter data_time: 0.0009 s/iter total_throughput: 3.81 samples/s lr: 5.00e-05 +[04/17 16:38:05] lb.utils.events INFO: eta: 3 days, 20:55:02 iteration: 214/335008 consumed_samples: 860 total_loss: 6.671 time: 1.0507 s/iter data_time: 0.0009 s/iter total_throughput: 3.81 samples/s lr: 5.00e-05 +[04/17 16:38:06] lb.utils.events INFO: eta: 3 days, 20:54:50 iteration: 215/335008 consumed_samples: 864 total_loss: 6.659 time: 1.0504 s/iter data_time: 0.0009 s/iter total_throughput: 3.81 samples/s lr: 5.00e-05 +[04/17 16:38:07] lb.utils.events INFO: eta: 3 days, 20:54:38 iteration: 216/335008 consumed_samples: 868 total_loss: 6.647 time: 1.0502 s/iter data_time: 0.0010 s/iter total_throughput: 3.81 samples/s lr: 5.00e-05 +[04/17 16:38:08] lb.utils.events INFO: eta: 3 days, 20:54:36 iteration: 217/335008 consumed_samples: 872 total_loss: 6.645 time: 1.0499 s/iter data_time: 0.0010 s/iter total_throughput: 3.81 samples/s lr: 5.00e-05 +[04/17 16:38:09] lb.utils.events INFO: eta: 3 days, 20:54:34 iteration: 218/335008 consumed_samples: 876 total_loss: 6.643 time: 1.0496 s/iter data_time: 0.0011 s/iter total_throughput: 3.81 samples/s lr: 5.00e-05 +[04/17 16:38:10] lb.utils.events INFO: eta: 3 days, 20:54:28 iteration: 219/335008 consumed_samples: 880 total_loss: 6.638 time: 1.0494 s/iter data_time: 0.0012 s/iter total_throughput: 3.81 samples/s lr: 5.00e-05 +[04/17 16:38:11] lb.utils.events INFO: eta: 3 days, 20:54:21 iteration: 220/335008 consumed_samples: 884 total_loss: 6.632 time: 1.0491 s/iter data_time: 0.0012 s/iter total_throughput: 3.81 samples/s lr: 5.00e-05 +[04/17 16:38:12] lb.utils.events INFO: eta: 3 days, 20:54:20 iteration: 221/335008 consumed_samples: 888 total_loss: 6.63 time: 1.0489 s/iter data_time: 0.0012 s/iter total_throughput: 3.81 samples/s lr: 5.00e-05 +[04/17 16:38:13] lb.utils.events INFO: eta: 3 days, 20:54:19 iteration: 222/335008 consumed_samples: 892 total_loss: 6.626 time: 1.0486 s/iter data_time: 0.0012 s/iter total_throughput: 3.81 samples/s lr: 5.00e-05 +[04/17 16:38:14] lb.utils.events INFO: eta: 3 days, 20:54:16 iteration: 223/335008 consumed_samples: 896 total_loss: 6.626 time: 1.0483 s/iter data_time: 0.0012 s/iter total_throughput: 3.82 samples/s lr: 5.00e-05 +[04/17 16:38:15] lb.utils.events INFO: eta: 3 days, 20:54:14 iteration: 224/335008 consumed_samples: 900 total_loss: 6.623 time: 1.0481 s/iter data_time: 0.0012 s/iter total_throughput: 3.82 samples/s lr: 5.00e-05 +[04/17 16:38:16] lb.utils.events INFO: eta: 3 days, 20:53:55 
iteration: 225/335008 consumed_samples: 904 total_loss: 6.618 time: 1.0478 s/iter data_time: 0.0012 s/iter total_throughput: 3.82 samples/s lr: 5.00e-05 +[04/17 16:38:17] lb.utils.events INFO: eta: 3 days, 20:53:35 iteration: 226/335008 consumed_samples: 908 total_loss: 6.615 time: 1.0476 s/iter data_time: 0.0011 s/iter total_throughput: 3.82 samples/s lr: 5.00e-05 +[04/17 16:38:18] lb.utils.events INFO: eta: 3 days, 20:53:26 iteration: 227/335008 consumed_samples: 912 total_loss: 6.61 time: 1.0473 s/iter data_time: 0.0011 s/iter total_throughput: 3.82 samples/s lr: 5.00e-05 +[04/17 16:38:19] lb.utils.events INFO: eta: 3 days, 20:53:17 iteration: 228/335008 consumed_samples: 916 total_loss: 6.605 time: 1.0471 s/iter data_time: 0.0011 s/iter total_throughput: 3.82 samples/s lr: 5.00e-05 +[04/17 16:38:20] lb.utils.events INFO: eta: 3 days, 20:52:57 iteration: 229/335008 consumed_samples: 920 total_loss: 6.602 time: 1.0469 s/iter data_time: 0.0011 s/iter total_throughput: 3.82 samples/s lr: 5.00e-05 +[04/17 16:38:21] lb.utils.events INFO: eta: 3 days, 20:52:37 iteration: 230/335008 consumed_samples: 924 total_loss: 6.601 time: 1.0466 s/iter data_time: 0.0011 s/iter total_throughput: 3.82 samples/s lr: 5.00e-05 +[04/17 16:38:22] lb.utils.events INFO: eta: 3 days, 20:52:28 iteration: 231/335008 consumed_samples: 928 total_loss: 6.598 time: 1.0464 s/iter data_time: 0.0011 s/iter total_throughput: 3.82 samples/s lr: 5.00e-05 +[04/17 16:38:23] lb.utils.events INFO: eta: 3 days, 20:52:20 iteration: 232/335008 consumed_samples: 932 total_loss: 6.594 time: 1.0462 s/iter data_time: 0.0012 s/iter total_throughput: 3.82 samples/s lr: 5.00e-05 +[04/17 16:38:24] lb.utils.events INFO: eta: 3 days, 20:52:07 iteration: 233/335008 consumed_samples: 936 total_loss: 6.594 time: 1.0459 s/iter data_time: 0.0012 s/iter total_throughput: 3.82 samples/s lr: 5.00e-05 +[04/17 16:38:25] lb.utils.events INFO: eta: 3 days, 20:51:54 iteration: 234/335008 consumed_samples: 940 total_loss: 6.59 time: 1.0457 s/iter data_time: 0.0012 s/iter total_throughput: 3.83 samples/s lr: 5.00e-05 +[04/17 16:38:26] lb.utils.events INFO: eta: 3 days, 20:51:52 iteration: 235/335008 consumed_samples: 944 total_loss: 6.586 time: 1.0455 s/iter data_time: 0.0013 s/iter total_throughput: 3.83 samples/s lr: 5.00e-05 +[04/17 16:38:27] lb.utils.events INFO: eta: 3 days, 20:51:49 iteration: 236/335008 consumed_samples: 948 total_loss: 6.584 time: 1.0452 s/iter data_time: 0.0013 s/iter total_throughput: 3.83 samples/s lr: 5.00e-05 +[04/17 16:38:28] lb.utils.events INFO: eta: 3 days, 20:51:31 iteration: 237/335008 consumed_samples: 952 total_loss: 6.581 time: 1.0450 s/iter data_time: 0.0013 s/iter total_throughput: 3.83 samples/s lr: 5.00e-05 +[04/17 16:38:29] lb.utils.events INFO: eta: 3 days, 20:51:12 iteration: 238/335008 consumed_samples: 956 total_loss: 6.579 time: 1.0448 s/iter data_time: 0.0013 s/iter total_throughput: 3.83 samples/s lr: 5.00e-05 +[04/17 16:38:30] lb.utils.events INFO: eta: 3 days, 20:50:47 iteration: 239/335008 consumed_samples: 960 total_loss: 6.576 time: 1.0446 s/iter data_time: 0.0011 s/iter total_throughput: 3.83 samples/s lr: 5.00e-05 +[04/17 16:38:31] lb.utils.events INFO: eta: 3 days, 20:51:10 iteration: 240/335008 consumed_samples: 964 total_loss: 6.572 time: 1.0444 s/iter data_time: 0.0011 s/iter total_throughput: 3.83 samples/s lr: 5.00e-05 +[04/17 16:38:32] lb.utils.events INFO: eta: 3 days, 20:51:27 iteration: 241/335008 consumed_samples: 968 total_loss: 6.568 time: 1.0442 s/iter data_time: 0.0011 s/iter 
total_throughput: 3.83 samples/s lr: 5.00e-05 +[04/17 16:38:33] lb.utils.events INFO: eta: 3 days, 20:51:43 iteration: 242/335008 consumed_samples: 972 total_loss: 6.568 time: 1.0441 s/iter data_time: 0.0011 s/iter total_throughput: 3.83 samples/s lr: 5.00e-05 +[04/17 16:38:34] lb.utils.events INFO: eta: 3 days, 20:51:44 iteration: 243/335008 consumed_samples: 976 total_loss: 6.567 time: 1.0439 s/iter data_time: 0.0010 s/iter total_throughput: 3.83 samples/s lr: 5.00e-05 +[04/17 16:38:35] lb.utils.events INFO: eta: 3 days, 20:51:44 iteration: 244/335008 consumed_samples: 980 total_loss: 6.566 time: 1.0437 s/iter data_time: 0.0010 s/iter total_throughput: 3.83 samples/s lr: 5.00e-05 +[04/17 16:38:36] lb.utils.events INFO: eta: 3 days, 20:51:55 iteration: 245/335008 consumed_samples: 984 total_loss: 6.565 time: 1.0435 s/iter data_time: 0.0009 s/iter total_throughput: 3.83 samples/s lr: 5.00e-05 +[04/17 16:38:37] lb.utils.events INFO: eta: 3 days, 20:52:06 iteration: 246/335008 consumed_samples: 988 total_loss: 6.565 time: 1.0434 s/iter data_time: 0.0009 s/iter total_throughput: 3.83 samples/s lr: 5.00e-05 +[04/17 16:38:38] lb.utils.events INFO: eta: 3 days, 20:52:12 iteration: 247/335008 consumed_samples: 992 total_loss: 6.563 time: 1.0432 s/iter data_time: 0.0009 s/iter total_throughput: 3.83 samples/s lr: 5.00e-05 +[04/17 16:38:39] lb.utils.events INFO: eta: 3 days, 20:52:19 iteration: 248/335008 consumed_samples: 996 total_loss: 6.56 time: 1.0430 s/iter data_time: 0.0009 s/iter total_throughput: 3.83 samples/s lr: 5.00e-05 +[04/17 16:38:40] lb.utils.events INFO: eta: 3 days, 20:52:37 iteration: 249/335008 consumed_samples: 1000 total_loss: 6.556 time: 1.0429 s/iter data_time: 0.0009 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 +[04/17 16:38:41] lb.utils.events INFO: eta: 3 days, 20:52:17 iteration: 250/335008 consumed_samples: 1004 total_loss: 6.553 time: 1.0427 s/iter data_time: 0.0008 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 +[04/17 16:38:42] lb.utils.events INFO: eta: 3 days, 20:52:08 iteration: 251/335008 consumed_samples: 1008 total_loss: 6.549 time: 1.0425 s/iter data_time: 0.0008 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 +[04/17 16:38:43] lb.utils.events INFO: eta: 3 days, 20:52:15 iteration: 252/335008 consumed_samples: 1012 total_loss: 6.546 time: 1.0424 s/iter data_time: 0.0008 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 +[04/17 16:38:44] lb.utils.events INFO: eta: 3 days, 20:52:06 iteration: 253/335008 consumed_samples: 1016 total_loss: 6.545 time: 1.0422 s/iter data_time: 0.0008 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 +[04/17 16:38:45] lb.utils.events INFO: eta: 3 days, 20:52:13 iteration: 254/335008 consumed_samples: 1020 total_loss: 6.545 time: 1.0420 s/iter data_time: 0.0008 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 +[04/17 16:38:46] lb.utils.events INFO: eta: 3 days, 20:52:31 iteration: 255/335008 consumed_samples: 1024 total_loss: 6.545 time: 1.0419 s/iter data_time: 0.0007 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 +[04/17 16:38:47] lb.utils.events INFO: eta: 3 days, 20:52:11 iteration: 256/335008 consumed_samples: 1028 total_loss: 6.545 time: 1.0417 s/iter data_time: 0.0006 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 +[04/17 16:38:48] lb.utils.events INFO: eta: 3 days, 20:52:02 iteration: 257/335008 consumed_samples: 1032 total_loss: 6.545 time: 1.0415 s/iter data_time: 0.0006 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 +[04/17 16:38:49] lb.utils.events INFO: eta: 3 days, 
20:51:54 iteration: 258/335008 consumed_samples: 1036 total_loss: 6.545 time: 1.0413 s/iter data_time: 0.0006 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 +[04/17 16:38:50] lb.utils.events INFO: eta: 3 days, 20:51:48 iteration: 259/335008 consumed_samples: 1040 total_loss: 6.545 time: 1.0412 s/iter data_time: 0.0006 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 +[04/17 16:38:51] lb.utils.events INFO: eta: 3 days, 20:51:43 iteration: 260/335008 consumed_samples: 1044 total_loss: 6.545 time: 1.0410 s/iter data_time: 0.0006 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 +[04/17 16:38:52] lb.utils.events INFO: eta: 3 days, 20:51:34 iteration: 261/335008 consumed_samples: 1048 total_loss: 6.545 time: 1.0408 s/iter data_time: 0.0006 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 +[04/17 16:38:53] lb.utils.events INFO: eta: 3 days, 20:51:26 iteration: 262/335008 consumed_samples: 1052 total_loss: 6.545 time: 1.0406 s/iter data_time: 0.0006 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 +[04/17 16:38:54] lb.utils.events INFO: eta: 3 days, 20:51:24 iteration: 263/335008 consumed_samples: 1056 total_loss: 6.545 time: 1.0404 s/iter data_time: 0.0006 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 +[04/17 16:38:55] lb.utils.events INFO: eta: 3 days, 20:51:21 iteration: 264/335008 consumed_samples: 1060 total_loss: 6.545 time: 1.0402 s/iter data_time: 0.0006 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:38:56] lb.utils.events INFO: eta: 3 days, 20:51:03 iteration: 265/335008 consumed_samples: 1064 total_loss: 6.545 time: 1.0400 s/iter data_time: 0.0007 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:38:57] lb.utils.events INFO: eta: 3 days, 20:50:44 iteration: 266/335008 consumed_samples: 1068 total_loss: 6.545 time: 1.0398 s/iter data_time: 0.0007 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:38:58] lb.utils.events INFO: eta: 3 days, 20:50:19 iteration: 267/335008 consumed_samples: 1072 total_loss: 6.545 time: 1.0397 s/iter data_time: 0.0007 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:38:59] lb.utils.events INFO: eta: 3 days, 20:49:54 iteration: 268/335008 consumed_samples: 1076 total_loss: 6.543 time: 1.0395 s/iter data_time: 0.0007 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:39:00] lb.utils.events INFO: eta: 3 days, 20:49:51 iteration: 269/335008 consumed_samples: 1080 total_loss: 6.54 time: 1.0393 s/iter data_time: 0.0007 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:39:01] lb.utils.events INFO: eta: 3 days, 20:49:47 iteration: 270/335008 consumed_samples: 1084 total_loss: 6.537 time: 1.0391 s/iter data_time: 0.0007 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:39:02] lb.utils.events INFO: eta: 3 days, 20:49:40 iteration: 271/335008 consumed_samples: 1088 total_loss: 6.537 time: 1.0390 s/iter data_time: 0.0007 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:39:03] lb.utils.events INFO: eta: 3 days, 20:49:32 iteration: 272/335008 consumed_samples: 1092 total_loss: 6.535 time: 1.0388 s/iter data_time: 0.0008 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:39:04] lb.utils.events INFO: eta: 3 days, 20:49:26 iteration: 273/335008 consumed_samples: 1096 total_loss: 6.533 time: 1.0386 s/iter data_time: 0.0008 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:39:05] lb.utils.events INFO: eta: 3 days, 20:49:20 iteration: 274/335008 consumed_samples: 1100 total_loss: 6.533 time: 1.0385 s/iter 
data_time: 0.0008 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:39:06] lb.utils.events INFO: eta: 3 days, 20:49:14 iteration: 275/335008 consumed_samples: 1104 total_loss: 6.531 time: 1.0383 s/iter data_time: 0.0008 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:39:07] lb.utils.events INFO: eta: 3 days, 20:49:18 iteration: 276/335008 consumed_samples: 1108 total_loss: 6.529 time: 1.0382 s/iter data_time: 0.0008 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:39:08] lb.utils.events INFO: eta: 3 days, 20:49:12 iteration: 277/335008 consumed_samples: 1112 total_loss: 6.528 time: 1.0380 s/iter data_time: 0.0008 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:39:09] lb.utils.events INFO: eta: 3 days, 20:49:06 iteration: 278/335008 consumed_samples: 1116 total_loss: 6.528 time: 1.0379 s/iter data_time: 0.0008 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:39:10] lb.utils.events INFO: eta: 3 days, 20:49:04 iteration: 279/335008 consumed_samples: 1120 total_loss: 6.526 time: 1.0377 s/iter data_time: 0.0008 s/iter total_throughput: 3.85 samples/s lr: 5.00e-05 +[04/17 16:39:11] lb.utils.events INFO: eta: 3 days, 20:49:04 iteration: 280/335008 consumed_samples: 1124 total_loss: 6.524 time: 1.0376 s/iter data_time: 0.0007 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:12] lb.utils.events INFO: eta: 3 days, 20:49:02 iteration: 281/335008 consumed_samples: 1128 total_loss: 6.522 time: 1.0374 s/iter data_time: 0.0007 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:13] lb.utils.events INFO: eta: 3 days, 20:49:02 iteration: 282/335008 consumed_samples: 1132 total_loss: 6.519 time: 1.0373 s/iter data_time: 0.0007 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:14] lb.utils.events INFO: eta: 3 days, 20:49:00 iteration: 283/335008 consumed_samples: 1136 total_loss: 6.517 time: 1.0372 s/iter data_time: 0.0007 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:15] lb.utils.events INFO: eta: 3 days, 20:48:59 iteration: 284/335008 consumed_samples: 1140 total_loss: 6.516 time: 1.0370 s/iter data_time: 0.0007 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:16] lb.utils.events INFO: eta: 3 days, 20:48:31 iteration: 285/335008 consumed_samples: 1144 total_loss: 6.516 time: 1.0368 s/iter data_time: 0.0007 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:17] lb.utils.events INFO: eta: 3 days, 20:48:02 iteration: 286/335008 consumed_samples: 1148 total_loss: 6.514 time: 1.0367 s/iter data_time: 0.0007 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:18] lb.utils.events INFO: eta: 3 days, 20:47:58 iteration: 287/335008 consumed_samples: 1152 total_loss: 6.513 time: 1.0365 s/iter data_time: 0.0008 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:19] lb.utils.events INFO: eta: 3 days, 20:47:54 iteration: 288/335008 consumed_samples: 1156 total_loss: 6.512 time: 1.0364 s/iter data_time: 0.0008 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:20] lb.utils.events INFO: eta: 3 days, 20:47:41 iteration: 289/335008 consumed_samples: 1160 total_loss: 6.512 time: 1.0362 s/iter data_time: 0.0008 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:21] lb.utils.events INFO: eta: 3 days, 20:47:28 iteration: 290/335008 consumed_samples: 1164 total_loss: 6.51 time: 1.0361 s/iter data_time: 0.0008 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:22] 
lb.utils.events INFO: eta: 3 days, 20:47:39 iteration: 291/335008 consumed_samples: 1168 total_loss: 6.51 time: 1.0360 s/iter data_time: 0.0008 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:23] lb.utils.events INFO: eta: 3 days, 20:47:26 iteration: 292/335008 consumed_samples: 1172 total_loss: 6.507 time: 1.0358 s/iter data_time: 0.0008 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:24] lb.utils.events INFO: eta: 3 days, 20:47:37 iteration: 293/335008 consumed_samples: 1176 total_loss: 6.504 time: 1.0357 s/iter data_time: 0.0007 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:25] lb.utils.events INFO: eta: 3 days, 20:47:24 iteration: 294/335008 consumed_samples: 1180 total_loss: 6.502 time: 1.0356 s/iter data_time: 0.0007 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:26] lb.utils.events INFO: eta: 3 days, 20:46:59 iteration: 295/335008 consumed_samples: 1184 total_loss: 6.498 time: 1.0354 s/iter data_time: 0.0007 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:27] lb.utils.events INFO: eta: 3 days, 20:46:33 iteration: 296/335008 consumed_samples: 1188 total_loss: 6.496 time: 1.0353 s/iter data_time: 0.0007 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:28] lb.utils.events INFO: eta: 3 days, 20:46:25 iteration: 297/335008 consumed_samples: 1192 total_loss: 6.495 time: 1.0351 s/iter data_time: 0.0007 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:29] lb.utils.events INFO: eta: 3 days, 20:46:18 iteration: 298/335008 consumed_samples: 1196 total_loss: 6.494 time: 1.0350 s/iter data_time: 0.0007 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 +[04/17 16:39:30] lb.utils.events INFO: eta: 3 days, 20:45:47 iteration: 299/335008 consumed_samples: 1200 total_loss: 6.493 time: 1.0349 s/iter data_time: 0.0008 s/iter total_throughput: 3.87 samples/s lr: 5.00e-05 +[04/17 16:39:31] lb.utils.events INFO: eta: 3 days, 20:45:17 iteration: 300/335008 consumed_samples: 1204 total_loss: 6.491 time: 1.0347 s/iter data_time: 0.0010 s/iter total_throughput: 3.87 samples/s lr: 5.00e-05 +[04/17 16:39:32] lb.utils.events INFO: eta: 3 days, 20:45:06 iteration: 301/335008 consumed_samples: 1208 total_loss: 6.488 time: 1.0346 s/iter data_time: 0.0011 s/iter total_throughput: 3.87 samples/s lr: 5.00e-05 +[04/17 16:39:33] lb.utils.events INFO: eta: 3 days, 20:44:55 iteration: 302/335008 consumed_samples: 1212 total_loss: 6.488 time: 1.0344 s/iter data_time: 0.0011 s/iter total_throughput: 3.87 samples/s lr: 5.00e-05 +[04/17 16:39:34] lb.utils.events INFO: eta: 3 days, 20:44:39 iteration: 303/335008 consumed_samples: 1216 total_loss: 6.486 time: 1.0343 s/iter data_time: 0.0011 s/iter total_throughput: 3.87 samples/s lr: 5.00e-05 +[04/17 16:39:35] lb.utils.events INFO: eta: 3 days, 20:44:23 iteration: 304/335008 consumed_samples: 1220 total_loss: 6.483 time: 1.0342 s/iter data_time: 0.0011 s/iter total_throughput: 3.87 samples/s lr: 5.00e-05 +[04/17 16:39:36] lb.utils.events INFO: eta: 3 days, 20:44:05 iteration: 305/335008 consumed_samples: 1224 total_loss: 6.483 time: 1.0340 s/iter data_time: 0.0011 s/iter total_throughput: 3.87 samples/s lr: 5.00e-05 +[04/17 16:39:37] lb.utils.events INFO: eta: 3 days, 20:43:48 iteration: 306/335008 consumed_samples: 1228 total_loss: 6.483 time: 1.0339 s/iter data_time: 0.0011 s/iter total_throughput: 3.87 samples/s lr: 5.00e-05 +[04/17 16:39:38] lb.utils.events INFO: eta: 3 days, 20:43:40 iteration: 307/335008 consumed_samples: 1232 
total_loss: 6.482 time: 1.0338 s/iter data_time: 0.0010 s/iter total_throughput: 3.87 samples/s lr: 5.00e-05 +[04/17 16:39:39] lb.utils.events INFO: eta: 3 days, 20:43:33 iteration: 308/335008 consumed_samples: 1236 total_loss: 6.479 time: 1.0336 s/iter data_time: 0.0010 s/iter total_throughput: 3.87 samples/s lr: 5.00e-05 +[04/17 16:39:40] lb.utils.events INFO: eta: 3 days, 20:43:11 iteration: 309/335008 consumed_samples: 1240 total_loss: 6.477 time: 1.0335 s/iter data_time: 0.0010 s/iter total_throughput: 3.87 samples/s lr: 5.00e-05 +[04/17 16:39:41] lb.utils.events INFO: eta: 3 days, 20:42:50 iteration: 310/335008 consumed_samples: 1244 total_loss: 6.475 time: 1.0334 s/iter data_time: 0.0010 s/iter total_throughput: 3.87 samples/s lr: 5.00e-05 +[04/17 16:39:42] lb.utils.events INFO: eta: 3 days, 20:43:09 iteration: 311/335008 consumed_samples: 1248 total_loss: 6.475 time: 1.0333 s/iter data_time: 0.0010 s/iter total_throughput: 3.87 samples/s lr: 5.00e-05 +[04/17 16:39:43] lb.utils.events INFO: eta: 3 days, 20:42:48 iteration: 312/335008 consumed_samples: 1252 total_loss: 6.473 time: 1.0332 s/iter data_time: 0.0012 s/iter total_throughput: 3.87 samples/s lr: 5.00e-05 +[04/17 16:39:44] lb.utils.events INFO: eta: 3 days, 20:42:24 iteration: 313/335008 consumed_samples: 1256 total_loss: 6.472 time: 1.0330 s/iter data_time: 0.0012 s/iter total_throughput: 3.87 samples/s lr: 5.00e-05 diff --git a/projects/MagicPrompt/oneflow_magicprompt/metrics.json b/projects/MagicPrompt/oneflow_magicprompt/metrics.json new file mode 100644 index 000000000..b8092c8f1 --- /dev/null +++ b/projects/MagicPrompt/oneflow_magicprompt/metrics.json @@ -0,0 +1,325 @@ +{"data_time": 0.0013666469603776932, "iteration": 0, "lr": 5e-05, "total_loss": 10.86047077178955} +{"data_time": 0.000827386393211782, "iteration": 1, "lr": 5e-05, "total_loss": 10.084523677825928} +{"data_time": 0.0012000647839158773, "eta_seconds": 364594.4456033595, "iteration": 2, "lr": 5e-05, "time": 1.088325385004282, "total_loss": 9.451910972595215} +{"data_time": 0.0007894893642514944, "eta_seconds": 365280.9437548164, "iteration": 3, "lr": 5e-05, "time": 1.0903778574429452, "total_loss": 9.38024377822876} +{"data_time": 0.0005307169631123543, "eta_seconds": 365967.4378013285, "iteration": 4, "lr": 5e-05, "time": 1.0924303298816085, "total_loss": 9.308576583862305} +{"data_time": 0.0004548154538497329, "eta_seconds": 365482.2507692026, "iteration": 5, "lr": 5e-05, "time": 1.090985279995948, "total_loss": 9.213207244873047} +{"data_time": 0.0005307169631123543, "eta_seconds": 365965.2529406687, "iteration": 6, "lr": 5e-05, "time": 1.0924303298816085, "total_loss": 9.117837905883789} +{"data_time": 0.0004548154538497329, "eta_seconds": 365674.8473155312, "iteration": 7, "lr": 5e-05, "time": 1.0915667084045708, "total_loss": 9.096522331237793} +{"data_time": 0.0005105519667267799, "eta_seconds": 365568.93536252226, "iteration": 8, "lr": 5e-05, "time": 1.091253810795024, "total_loss": 9.075206756591797} +{"data_time": 0.0004516154294833541, "eta_seconds": 365764.9098791953, "iteration": 9, "lr": 5e-05, "time": 1.0918420703383163, "total_loss": 8.991514682769775} +{"data_time": 0.0005105519667267799, "eta_seconds": 365960.8832193492, "iteration": 10, "lr": 5e-05, "time": 1.0924303298816085, "total_loss": 8.907822608947754} +{"data_time": 0.001127143856137991, "iteration": 0, "lr": 5e-05, "total_loss": 10.86047077178955} +{"data_time": 0.0006743174744769931, "iteration": 1, "lr": 5e-05, "total_loss": 10.084524154663086} +{"data_time": 
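Note for reviewers: the fields in the training log above and in the metrics.json added below fit together arithmetically. A minimal sketch, not part of this patch, of how to interpret them; MAX_ITER and SAMPLES_PER_ITER are read off the log itself, the path is the one this diff adds, and the eta relationship is inferred from the numbers rather than taken from LiBai's source:

import json

MAX_ITER = 335008        # from "iteration: N/335008" in the log above
SAMPLES_PER_ITER = 4     # consumed_samples advances by 4 per iteration

def throughput(time_per_iter):
    # "total_throughput" in the log: samples per iteration / seconds per
    # iteration, e.g. 4 / 1.0330 ~= 3.87 samples/s at iteration 313.
    return SAMPLES_PER_ITER / time_per_iter

def eta_seconds(iteration, time_per_iter):
    # "eta" appears to be remaining iterations times a smoothed recent
    # seconds-per-iteration (the "time" field in metrics.json, ~0.997 s here),
    # not the running-average "time" printed in the log line:
    # (335008 - 313) * 0.997 ~= 333,700 s ~= the printed "3 days, 20:42:24".
    return (MAX_ITER - iteration) * time_per_iter

def load_runs(path="projects/MagicPrompt/oneflow_magicprompt/metrics.json"):
    # metrics.json (added below) is JSON lines, one record per iteration; the
    # iteration counter restarts at 0 partway through because two runs were
    # appended, so split on resets before plotting a loss curve.
    runs, prev = [[]], -1
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            if rec["iteration"] <= prev:
                runs.append([])
            runs[-1].append(rec)
            prev = rec["iteration"]
    return runs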
diff --git a/projects/MagicPrompt/oneflow_magicprompt/metrics.json b/projects/MagicPrompt/oneflow_magicprompt/metrics.json
new file mode 100644
index 000000000..b8092c8f1
--- /dev/null
+++ b/projects/MagicPrompt/oneflow_magicprompt/metrics.json
@@ -0,0 +1,325 @@
+{"data_time": 0.0013666469603776932, "iteration": 0, "lr": 5e-05, "total_loss": 10.86047077178955}
+{"data_time": 0.000827386393211782, "iteration": 1, "lr": 5e-05, "total_loss": 10.084523677825928}
+{"data_time": 0.0012000647839158773, "eta_seconds": 364594.4456033595, "iteration": 2, "lr": 5e-05, "time": 1.088325385004282, "total_loss": 9.451910972595215}
+[... records for iterations 3-10 of this run elided (time ~1.09 s/iter); the iteration counter then restarts at 0 and a second appended run follows, whose records for iterations 0-130 are elided here (time ~1.00 s/iter apart from a spike to ~1.57 s/iter around iterations 74-90; total_loss falls from 10.86 to 6.69) ...]
+{"data_time": 0.0005584561731666327, "eta_seconds": 335237.18199300393, "iteration": 131, "lr": 5e-05, "time": 0.993885466014035, "total_loss": 6.682970762252808}
+{"data_time": 0.0005584561731666327, "eta_seconds": 
335233.9310363168, "iteration": 132, "lr": 5e-05, "time": 0.9935553579125553, "total_loss": 6.682970762252808} +{"data_time": 0.0005584561731666327, "eta_seconds": 335226.1645225459, "iteration": 133, "lr": 5e-05, "time": 0.9935553579125553, "total_loss": 6.682970762252808} +{"data_time": 0.0005584561731666327, "eta_seconds": 335218.39804918086, "iteration": 134, "lr": 5e-05, "time": 0.9935553579125553, "total_loss": 6.6753246784210205} +{"data_time": 0.00058128556702286, "eta_seconds": 335213.74423422944, "iteration": 135, "lr": 5e-05, "time": 0.993885466014035, "total_loss": 6.6753246784210205} +{"data_time": 0.0006747349398210645, "eta_seconds": 335209.090441094, "iteration": 136, "lr": 5e-05, "time": 0.9935553579125553, "total_loss": 6.65562105178833} +{"data_time": 0.0005840284284204245, "eta_seconds": 335207.17007183586, "iteration": 137, "lr": 5e-05, "time": 0.9936283569550142, "total_loss": 6.65562105178833} +{"data_time": 0.0005840284284204245, "eta_seconds": 335205.24970806856, "iteration": 138, "lr": 5e-05, "time": 0.9939556659664959, "total_loss": 6.622169494628906} +{"data_time": 0.00058128556702286, "eta_seconds": 335199.6983238603, "iteration": 139, "lr": 5e-05, "time": 0.9946119440719485, "total_loss": 6.598361492156982} +{"data_time": 0.000555802951566875, "eta_seconds": 335194.1469668292, "iteration": 140, "lr": 5e-05, "time": 0.9946119440719485, "total_loss": 6.598361492156982} +{"data_time": 0.0005242860643193126, "eta_seconds": 335191.2503376163, "iteration": 141, "lr": 5e-05, "time": 0.9946545810671523, "total_loss": 6.598361492156982} +{"data_time": 0.0005137396510690451, "eta_seconds": 335188.3537197253, "iteration": 142, "lr": 5e-05, "time": 0.9954509395174682, "total_loss": 6.598361492156982} +{"data_time": 0.0005137396510690451, "eta_seconds": 335174.436885003, "iteration": 143, "lr": 5e-05, "time": 0.9946545810671523, "total_loss": 6.5924601554870605} +{"data_time": 0.0005137396510690451, "eta_seconds": 335160.5201274217, "iteration": 144, "lr": 5e-05, "time": 0.9954509395174682, "total_loss": 6.5924601554870605} +{"data_time": 0.0005242860643193126, "eta_seconds": 335152.2409832876, "iteration": 145, "lr": 5e-05, "time": 0.9954509395174682, "total_loss": 6.574693918228149} +{"data_time": 0.000555802951566875, "eta_seconds": 335143.9618826236, "iteration": 146, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.574693918228149} +{"data_time": 0.0005840284284204245, "eta_seconds": 335140.64378096955, "iteration": 147, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.5924601554870605} +{"data_time": 0.0006746329599991441, "eta_seconds": 335141.9601925977, "iteration": 148, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.5924601554870605} +{"data_time": 0.0007704079616814852, "eta_seconds": 335138.6421047838, "iteration": 149, "lr": 5e-05, "time": 0.9964497874025255, "total_loss": 6.574693918228149} +{"data_time": 0.0007704079616814852, "eta_seconds": 335135.32403081004, "iteration": 150, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.574693918228149} +{"data_time": 0.0007344270125031471, "eta_seconds": 335127.993125611, "iteration": 151, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.574693918228149} +{"data_time": 0.0006719154771417379, "eta_seconds": 335120.6622582197, "iteration": 152, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.555528879165649} +{"data_time": 0.0006618404295295477, "eta_seconds": 335100.5540199692, "iteration": 153, "lr": 5e-05, "time": 0.9954509395174682, "total_loss": 6.5430848598480225} 
+{"data_time": 0.0006618404295295477, "eta_seconds": 335080.44589584274, "iteration": 154, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.5430848598480225} +{"data_time": 0.0006308654556050897, "eta_seconds": 335060.4153818642, "iteration": 155, "lr": 5e-05, "time": 0.9948097829474136, "total_loss": 6.538717031478882} +{"data_time": 0.0006150599801912904, "eta_seconds": 335040.3849815468, "iteration": 156, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.538717031478882} +{"data_time": 0.0006150599801912904, "eta_seconds": 335058.4141370433, "iteration": 157, "lr": 5e-05, "time": 0.9964497874025255, "total_loss": 6.538717031478882} +{"data_time": 0.0006308654556050897, "eta_seconds": 335038.38385038706, "iteration": 158, "lr": 5e-05, "time": 0.996667968458496, "total_loss": 6.538717031478882} +{"data_time": 0.0006618404295295477, "eta_seconds": 335034.1647824049, "iteration": 159, "lr": 5e-05, "time": 0.996667968458496, "total_loss": 6.531677722930908} +{"data_time": 0.00069171201903373, "eta_seconds": 335029.9457336464, "iteration": 160, "lr": 5e-05, "time": 0.9969171308912337, "total_loss": 6.540605068206787} +{"data_time": 0.0007017870666459203, "eta_seconds": 335020.0469476585, "iteration": 161, "lr": 5e-05, "time": 0.9970375073608011, "total_loss": 6.531677722930908} +{"data_time": 0.0007344270125031471, "eta_seconds": 335010.1482148189, "iteration": 162, "lr": 5e-05, "time": 0.9969171308912337, "total_loss": 6.5231032371521} +{"data_time": 0.0008766170358285308, "eta_seconds": 335018.0459080944, "iteration": 163, "lr": 5e-05, "time": 0.9970375073608011, "total_loss": 6.5231032371521} +{"data_time": 0.0009251345181837678, "eta_seconds": 335008.147228403, "iteration": 164, "lr": 5e-05, "time": 0.9969171308912337, "total_loss": 6.5231032371521} +{"data_time": 0.0007547514978796244, "eta_seconds": 335005.9995710214, "iteration": 165, "lr": 5e-05, "time": 0.9969171308912337, "total_loss": 6.503211975097656} +{"data_time": 0.0007061564829200506, "eta_seconds": 335003.8519204918, "iteration": 166, "lr": 5e-05, "time": 0.9969171308912337, "total_loss": 6.503211975097656} +{"data_time": 0.0007017095340415835, "eta_seconds": 335003.9985914575, "iteration": 167, "lr": 5e-05, "time": 0.9970375073608011, "total_loss": 6.503211975097656} +{"data_time": 0.00069171201903373, "eta_seconds": 335001.85094777984, "iteration": 168, "lr": 5e-05, "time": 0.9954168034018949, "total_loss": 6.503211975097656} +{"data_time": 0.0006777989910915494, "eta_seconds": 334990.38944235747, "iteration": 169, "lr": 5e-05, "time": 0.9954168034018949, "total_loss": 6.527876853942871} +{"data_time": 0.0006777989910915494, "eta_seconds": 334978.92799941916, "iteration": 170, "lr": 5e-05, "time": 0.9970375073608011, "total_loss": 6.519261360168457} +{"data_time": 0.0006468240171670914, "eta_seconds": 334988.3885321296, "iteration": 171, "lr": 5e-05, "time": 0.9974556299857795, "total_loss": 6.5367631912231445} +{"data_time": 0.0006377500249072909, "eta_seconds": 334976.92715167534, "iteration": 172, "lr": 5e-05, "time": 0.998238944564946, "total_loss": 6.5454795360565186} +{"data_time": 0.0005667039658874273, "eta_seconds": 334971.06126818527, "iteration": 173, "lr": 5e-05, "time": 0.9987658020108938, "total_loss": 6.5454795360565186} +{"data_time": 0.0005667039658874273, "eta_seconds": 334974.9263039315, "iteration": 174, "lr": 5e-05, "time": 0.9989636850077659, "total_loss": 6.5454795360565186} +{"data_time": 0.0005667039658874273, "eta_seconds": 334984.3867116738, "iteration": 175, "lr": 5e-05, "time": 
0.9990744079696015, "total_loss": 6.5454795360565186} +{"data_time": 0.0005272645503282547, "eta_seconds": 334993.84705693205, "iteration": 176, "lr": 5e-05, "time": 0.9995363809866831, "total_loss": 6.5551230907440186} +{"data_time": 0.0005272645503282547, "eta_seconds": 334982.3858014459, "iteration": 177, "lr": 5e-05, "time": 0.9990744079696015, "total_loss": 6.5551230907440186} +{"data_time": 0.0005272645503282547, "eta_seconds": 334970.92460844386, "iteration": 178, "lr": 5e-05, "time": 0.9990744079696015, "total_loss": 6.549628496170044} +{"data_time": 0.0005272645503282547, "eta_seconds": 334980.38489121804, "iteration": 179, "lr": 5e-05, "time": 0.9995363809866831, "total_loss": 6.559171199798584} +{"data_time": 0.000523862661793828, "eta_seconds": 334968.92376070004, "iteration": 180, "lr": 5e-05, "time": 0.9995363809866831, "total_loss": 6.559171199798584} +{"data_time": 0.000523862661793828, "eta_seconds": 334963.05799345765, "iteration": 181, "lr": 5e-05, "time": 0.9995363809866831, "total_loss": 6.559171199798584} +{"data_time": 0.0005238075973466039, "eta_seconds": 334957.1922552772, "iteration": 182, "lr": 5e-05, "time": 0.9995363809866831, "total_loss": 6.559171199798584} +{"data_time": 0.0005238075973466039, "eta_seconds": 334946.69257010054, "iteration": 183, "lr": 5e-05, "time": 0.9990744079696015, "total_loss": 6.559171199798584} +{"data_time": 0.0005238075973466039, "eta_seconds": 334936.1929416659, "iteration": 184, "lr": 5e-05, "time": 0.9990744079696015, "total_loss": 6.549854278564453} +{"data_time": 0.0005270520923659205, "eta_seconds": 334931.1697197673, "iteration": 185, "lr": 5e-05, "time": 0.9990744079696015, "total_loss": 6.549854278564453} +{"data_time": 0.0005270520923659205, "eta_seconds": 334926.1465218987, "iteration": 186, "lr": 5e-05, "time": 0.9987675820011646, "total_loss": 6.545806169509888} +{"data_time": 0.0005272094858810306, "eta_seconds": 334917.94152040733, "iteration": 187, "lr": 5e-05, "time": 0.9984382524853572, "total_loss": 6.5402257442474365} +{"data_time": 0.0005272094858810306, "eta_seconds": 334909.73656195216, "iteration": 188, "lr": 5e-05, "time": 0.9984382524853572, "total_loss": 6.5315093994140625} +{"data_time": 0.0005275964504107833, "eta_seconds": 334898.5958666769, "iteration": 189, "lr": 5e-05, "time": 0.9981814610073343, "total_loss": 6.5315093994140625} +{"data_time": 0.000542824505828321, "eta_seconds": 334887.45523197437, "iteration": 190, "lr": 5e-05, "time": 0.9971339404582977, "total_loss": 6.531692981719971} +{"data_time": 0.000542824505828321, "eta_seconds": 334862.1622792296, "iteration": 191, "lr": 5e-05, "time": 0.9960323289269581, "total_loss": 6.520740032196045} +{"data_time": 0.0005700605688616633, "eta_seconds": 334836.86947159586, "iteration": 192, "lr": 5e-05, "time": 0.9951661893865094, "total_loss": 6.5143210887908936} +{"data_time": 0.0006128224777057767, "eta_seconds": 334826.8279172762, "iteration": 193, "lr": 5e-05, "time": 0.9951661893865094, "total_loss": 6.5220959186553955} +{"data_time": 0.0006128224777057767, "eta_seconds": 334816.78641696554, "iteration": 194, "lr": 5e-05, "time": 0.9945189794525504, "total_loss": 6.531692981719971} +{"data_time": 0.000619517988525331, "eta_seconds": 334809.72645719815, "iteration": 195, "lr": 5e-05, "time": 0.9944486394524574, "total_loss": 6.531692981719971} +{"data_time": 0.0006571760168299079, "eta_seconds": 334802.6665336299, "iteration": 196, "lr": 5e-05, "time": 0.9942038529552519, "total_loss": 6.5220959186553955} +{"data_time": 0.0006571760168299079, 
"eta_seconds": 334796.45443529007, "iteration": 197, "lr": 5e-05, "time": 0.9934806060045958, "total_loss": 6.496842384338379} +{"data_time": 0.0006424394669011235, "eta_seconds": 334790.2423680851, "iteration": 198, "lr": 5e-05, "time": 0.9935311529552564, "total_loss": 6.485797643661499} +{"data_time": 0.0006424394669011235, "eta_seconds": 334788.8821617225, "iteration": 199, "lr": 5e-05, "time": 0.9930527874967083, "total_loss": 6.476698160171509} +{"data_time": 0.0006619264604523778, "eta_seconds": 334787.521957512, "iteration": 200, "lr": 5e-05, "time": 0.9928158218972385, "total_loss": 6.476698160171509} +{"data_time": 0.0006619264604523778, "eta_seconds": 334774.9385753176, "iteration": 201, "lr": 5e-05, "time": 0.9926744998665527, "total_loss": 6.476698160171509} +{"data_time": 0.0006824899464845657, "eta_seconds": 334762.35526231816, "iteration": 202, "lr": 5e-05, "time": 0.9926744998665527, "total_loss": 6.483594655990601} +{"data_time": 0.0007025444647297263, "eta_seconds": 334732.4790458693, "iteration": 203, "lr": 5e-05, "time": 0.9925029299920425, "total_loss": 6.476698160171509} +{"data_time": 0.0007025444647297263, "eta_seconds": 334702.6030019175, "iteration": 204, "lr": 5e-05, "time": 0.9925029299920425, "total_loss": 6.476698160171509} +{"data_time": 0.0007025444647297263, "eta_seconds": 334690.9468510486, "iteration": 205, "lr": 5e-05, "time": 0.9926744998665527, "total_loss": 6.476698160171509} +{"data_time": 0.0007025444647297263, "eta_seconds": 334679.290763838, "iteration": 206, "lr": 5e-05, "time": 0.9926744998665527, "total_loss": 6.483594655990601} +{"data_time": 0.0006630029529333115, "eta_seconds": 334645.7890883088, "iteration": 207, "lr": 5e-05, "time": 0.9926744998665527, "total_loss": 6.483594655990601} +{"data_time": 0.0006630029529333115, "eta_seconds": 334612.2876069376, "iteration": 208, "lr": 5e-05, "time": 0.9928079464007169, "total_loss": 6.491384506225586} +{"data_time": 0.0007025444647297263, "eta_seconds": 334617.1578535547, "iteration": 209, "lr": 5e-05, "time": 0.992936848429963, "total_loss": 6.491384506225586} +{"data_time": 0.0007025444647297263, "eta_seconds": 334610.28872230765, "iteration": 210, "lr": 5e-05, "time": 0.9927955263992772, "total_loss": 6.483594655990601} +{"data_time": 0.0007025444647297263, "eta_seconds": 334598.24704700103, "iteration": 211, "lr": 5e-05, "time": 0.9927955263992772, "total_loss": 6.482285022735596} +{"data_time": 0.0006753400666639209, "eta_seconds": 334586.20543765835, "iteration": 212, "lr": 5e-05, "time": 0.9927955263992772, "total_loss": 6.473680734634399} +{"data_time": 0.0006651415023952723, "eta_seconds": 334544.1248098032, "iteration": 213, "lr": 5e-05, "time": 0.9927955263992772, "total_loss": 6.472007989883423} +{"data_time": 0.0006651415023952723, "eta_seconds": 334502.0444273602, "iteration": 214, "lr": 5e-05, "time": 0.9927955263992772, "total_loss": 6.472007989883423} +{"data_time": 0.0006651415023952723, "eta_seconds": 334490.33834068105, "iteration": 215, "lr": 5e-05, "time": 0.9930449120001867, "total_loss": 6.471388816833496} +{"data_time": 0.0006651415023952723, "eta_seconds": 334478.63231796375, "iteration": 216, "lr": 5e-05, "time": 0.9932783750118688, "total_loss": 6.471388816833496} +{"data_time": 0.0006753400666639209, "eta_seconds": 334476.746382016, "iteration": 217, "lr": 5e-05, "time": 0.9932783750118688, "total_loss": 6.463591575622559} +{"data_time": 0.0007049249252304435, "eta_seconds": 334474.8604513663, "iteration": 218, "lr": 5e-05, "time": 0.9931024410761893, "total_loss": 
6.463591575622559} +{"data_time": 0.0007049249252304435, "eta_seconds": 334468.3186378875, "iteration": 219, "lr": 5e-05, "time": 0.9930178551003337, "total_loss": 6.463591575622559} +{"data_time": 0.0007908979896456003, "eta_seconds": 334461.7768575207, "iteration": 220, "lr": 5e-05, "time": 0.9930178551003337, "total_loss": 6.455276966094971} +{"data_time": 0.0007908979896456003, "eta_seconds": 334460.49928020383, "iteration": 221, "lr": 5e-05, "time": 0.993219965021126, "total_loss": 6.455276966094971} +{"data_time": 0.0007008734392002225, "eta_seconds": 334459.221704551, "iteration": 222, "lr": 5e-05, "time": 0.993219965021126, "total_loss": 6.4539055824279785} +{"data_time": 0.0007008734392002225, "eta_seconds": 334456.71181206405, "iteration": 223, "lr": 5e-05, "time": 0.993219965021126, "total_loss": 6.455276966094971} +{"data_time": 0.0007008734392002225, "eta_seconds": 334454.201928603, "iteration": 224, "lr": 5e-05, "time": 0.993219965021126, "total_loss": 6.462490081787109} +{"data_time": 0.0007008734392002225, "eta_seconds": 334435.0603942699, "iteration": 225, "lr": 5e-05, "time": 0.9930178551003337, "total_loss": 6.471423864364624} +{"data_time": 0.0006712885806336999, "eta_seconds": 334415.9189683208, "iteration": 226, "lr": 5e-05, "time": 0.9930178551003337, "total_loss": 6.462490081787109} +{"data_time": 0.0006874479586258531, "eta_seconds": 334406.67357347906, "iteration": 227, "lr": 5e-05, "time": 0.9926686605904251, "total_loss": 6.462490081787109} +{"data_time": 0.0006874479586258531, "eta_seconds": 334397.4282279024, "iteration": 228, "lr": 5e-05, "time": 0.9926686605904251, "total_loss": 6.462490081787109} +{"data_time": 0.0006712885806336999, "eta_seconds": 334377.3199458297, "iteration": 229, "lr": 5e-05, "time": 0.9923333375481889, "total_loss": 6.4539055824279785} +{"data_time": 0.0006712885806336999, "eta_seconds": 334357.21177791874, "iteration": 230, "lr": 5e-05, "time": 0.9926686605904251, "total_loss": 6.4539055824279785} +{"data_time": 0.0006874479586258531, "eta_seconds": 334348.7153953817, "iteration": 231, "lr": 5e-05, "time": 0.9923333375481889, "total_loss": 6.451754808425903} +{"data_time": 0.0007774725090712309, "eta_seconds": 334340.21905763657, "iteration": 232, "lr": 5e-05, "time": 0.9923333375481889, "total_loss": 6.451754808425903} +{"data_time": 0.0008144070161506534, "eta_seconds": 334327.28044745955, "iteration": 233, "lr": 5e-05, "time": 0.9922797135077417, "total_loss": 6.448373079299927} +{"data_time": 0.0008144070161506534, "eta_seconds": 334314.3419086137, "iteration": 234, "lr": 5e-05, "time": 0.9923333375481889, "total_loss": 6.44198203086853} +{"data_time": 0.0008661691099405289, "eta_seconds": 334312.0323186843, "iteration": 235, "lr": 5e-05, "time": 0.9923333375481889, "total_loss": 6.44198203086853} +{"data_time": 0.0008661691099405289, "eta_seconds": 334309.7227365868, "iteration": 236, "lr": 5e-05, "time": 0.9922797135077417, "total_loss": 6.448373079299927} +{"data_time": 0.0008144070161506534, "eta_seconds": 334291.178135667, "iteration": 237, "lr": 5e-05, "time": 0.9922797135077417, "total_loss": 6.451754808425903} +{"data_time": 0.0008245260687544942, "eta_seconds": 334272.63363957126, "iteration": 238, "lr": 5e-05, "time": 0.9921245854347944, "total_loss": 6.448373079299927} +{"data_time": 0.0007793945260345936, "eta_seconds": 334247.5139494054, "iteration": 239, "lr": 5e-05, "time": 0.9919842553790659, "total_loss": 6.44198203086853} +{"data_time": 0.0007429574616253376, "eta_seconds": 334270.63660499733, "iteration": 240, 
"lr": 5e-05, "time": 0.9921245854347944, "total_loss": 6.44198203086853} +{"data_time": 0.0007429574616253376, "eta_seconds": 334287.183856871, "iteration": 241, "lr": 5e-05, "time": 0.9921245854347944, "total_loss": 6.435945987701416} +{"data_time": 0.0007429574616253376, "eta_seconds": 334303.73100392055, "iteration": 242, "lr": 5e-05, "time": 0.9921245854347944, "total_loss": 6.435945987701416} +{"data_time": 0.0007060229545459151, "eta_seconds": 334304.0433104681, "iteration": 243, "lr": 5e-05, "time": 0.992783167399466, "total_loss": 6.435945987701416} +{"data_time": 0.0007060229545459151, "eta_seconds": 334304.35560918367, "iteration": 244, "lr": 5e-05, "time": 0.992783167399466, "total_loss": 6.433191299438477} +{"data_time": 0.0006449039792641997, "eta_seconds": 334315.29646015656, "iteration": 245, "lr": 5e-05, "time": 0.9935508704511449, "total_loss": 6.42838454246521} +{"data_time": 0.0006449039792641997, "eta_seconds": 334326.2372397983, "iteration": 246, "lr": 5e-05, "time": 0.9941959194839001, "total_loss": 6.4238059520721436} +{"data_time": 0.0005435205530375242, "eta_seconds": 334332.7358166594, "iteration": 247, "lr": 5e-05, "time": 0.9958192268386483, "total_loss": 6.4238059520721436} +{"data_time": 0.0005435205530375242, "eta_seconds": 334339.23434872855, "iteration": 248, "lr": 5e-05, "time": 0.9958192268386483, "total_loss": 6.419218301773071} +{"data_time": 0.000493954517878592, "eta_seconds": 334357.3438828897, "iteration": 249, "lr": 5e-05, "time": 0.9990725318202749, "total_loss": 6.421024799346924} +{"data_time": 0.0004903025692328811, "eta_seconds": 334337.2368565963, "iteration": 250, "lr": 5e-05, "time": 0.9990725318202749, "total_loss": 6.417817831039429} +{"data_time": 0.0004903025692328811, "eta_seconds": 334328.74092197884, "iteration": 251, "lr": 5e-05, "time": 0.9999000079696998, "total_loss": 6.417817831039429} +{"data_time": 0.00048560311552137136, "eta_seconds": 334335.23936446407, "iteration": 252, "lr": 5e-05, "time": 1.0011805914109573, "total_loss": 6.417817831039429} +{"data_time": 0.00048067199531942606, "eta_seconds": 334326.74347463856, "iteration": 253, "lr": 5e-05, "time": 1.0011805914109573, "total_loss": 6.417817831039429} +{"data_time": 0.00048067199531942606, "eta_seconds": 334333.2418723318, "iteration": 254, "lr": 5e-05, "time": 1.0014799559721723, "total_loss": 6.413652420043945} +{"data_time": 0.00048067199531942606, "eta_seconds": 334351.35106400773, "iteration": 255, "lr": 5e-05, "time": 1.0017993689980358, "total_loss": 6.404175043106079} +{"data_time": 0.00048067199531942606, "eta_seconds": 334331.2443801996, "iteration": 256, "lr": 5e-05, "time": 1.0017993689980358, "total_loss": 6.404175043106079} +{"data_time": 0.00048560311552137136, "eta_seconds": 334322.748579958, "iteration": 257, "lr": 5e-05, "time": 1.0017993689980358, "total_loss": 6.393757343292236} +{"data_time": 0.00048560311552137136, "eta_seconds": 334314.25282450835, "iteration": 258, "lr": 5e-05, "time": 1.0017993689980358, "total_loss": 6.393757343292236} +{"data_time": 0.00048560311552137136, "eta_seconds": 334308.9179540165, "iteration": 259, "lr": 5e-05, "time": 1.0017993689980358, "total_loss": 6.389266014099121} +{"data_time": 0.0004903025692328811, "eta_seconds": 334303.58310943167, "iteration": 260, "lr": 5e-05, "time": 1.0014799559721723, "total_loss": 6.383808374404907} +{"data_time": 0.000493954517878592, "eta_seconds": 334294.9816671086, "iteration": 261, "lr": 5e-05, "time": 1.0011805914109573, "total_loss": 6.383808374404907} +{"data_time": 
0.0005103294970467687, "eta_seconds": 334286.3802702096, "iteration": 262, "lr": 5e-05, "time": 0.9999000079696998, "total_loss": 6.383808374404907} +{"data_time": 0.0005292044952511787, "eta_seconds": 334284.0707899276, "iteration": 263, "lr": 5e-05, "time": 0.998466563061811, "total_loss": 6.383808374404907} +{"data_time": 0.0005292044952511787, "eta_seconds": 334281.76131747756, "iteration": 264, "lr": 5e-05, "time": 0.9980369674740359, "total_loss": 6.383808374404907} +{"data_time": 0.0005431510508060455, "eta_seconds": 334263.2181840949, "iteration": 265, "lr": 5e-05, "time": 0.9966902529122308, "total_loss": 6.373529672622681} +{"data_time": 0.0005570600042119622, "eta_seconds": 334244.6751555363, "iteration": 266, "lr": 5e-05, "time": 0.9954996254527941, "total_loss": 6.373529672622681} +{"data_time": 0.0005570600042119622, "eta_seconds": 334219.55748286564, "iteration": 267, "lr": 5e-05, "time": 0.9953462380217388, "total_loss": 6.373529672622681} +{"data_time": 0.0005858434597030282, "eta_seconds": 334194.4399543018, "iteration": 268, "lr": 5e-05, "time": 0.9947797351051122, "total_loss": 6.373529672622681} +{"data_time": 0.0007000388577580452, "eta_seconds": 334191.10474916664, "iteration": 269, "lr": 5e-05, "time": 0.9942065631039441, "total_loss": 6.373529672622681} +{"data_time": 0.0007000388577580452, "eta_seconds": 334187.7695579936, "iteration": 270, "lr": 5e-05, "time": 0.9939530480187386, "total_loss": 6.383808374404907} +{"data_time": 0.0007000388577580452, "eta_seconds": 334180.1698740199, "iteration": 271, "lr": 5e-05, "time": 0.9936071829870343, "total_loss": 6.399683713912964} +{"data_time": 0.000794482883065939, "eta_seconds": 334172.5702294882, "iteration": 272, "lr": 5e-05, "time": 0.9931687869830057, "total_loss": 6.383808374404907} +{"data_time": 0.0008036409271880984, "eta_seconds": 334166.3492401871, "iteration": 273, "lr": 5e-05, "time": 0.9936071829870343, "total_loss": 6.383808374404907} +{"data_time": 0.000794482883065939, "eta_seconds": 334160.1282820909, "iteration": 274, "lr": 5e-05, "time": 0.9931687869830057, "total_loss": 6.401249170303345} +{"data_time": 0.0007486874237656593, "eta_seconds": 334154.13609696925, "iteration": 275, "lr": 5e-05, "time": 0.9931687869830057, "total_loss": 6.424301862716675} +{"data_time": 0.0007550184382125735, "eta_seconds": 334158.1317049486, "iteration": 276, "lr": 5e-05, "time": 0.9931687869830057, "total_loss": 6.414404392242432} +{"data_time": 0.0007550184382125735, "eta_seconds": 334152.13954966515, "iteration": 277, "lr": 5e-05, "time": 0.9931687869830057, "total_loss": 6.424301862716675} +{"data_time": 0.0007534719770774245, "eta_seconds": 334146.1474242199, "iteration": 278, "lr": 5e-05, "time": 0.9931687869830057, "total_loss": 6.424301862716675} +{"data_time": 0.0007136190542951226, "eta_seconds": 334144.9861645689, "iteration": 279, "lr": 5e-05, "time": 0.9928738559829071, "total_loss": 6.424301862716675} +{"data_time": 0.0006624269299209118, "eta_seconds": 334144.15090675396, "iteration": 280, "lr": 5e-05, "time": 0.9931687869830057, "total_loss": 6.424301862716675} +{"data_time": 0.0006144349463284016, "eta_seconds": 334142.98964807694, "iteration": 281, "lr": 5e-05, "time": 0.9930181190138683, "total_loss": 6.420524597167969} +{"data_time": 0.0006144349463284016, "eta_seconds": 334142.15438928804, "iteration": 282, "lr": 5e-05, "time": 0.9932688130065799, "total_loss": 6.414404392242432} +{"data_time": 0.0006144349463284016, "eta_seconds": 334140.99313158495, "iteration": 283, "lr": 5e-05, "time": 
0.9932688130065799, "total_loss": 6.4124915599823} +{"data_time": 0.0006227684207260609, "eta_seconds": 334139.8318748558, "iteration": 284, "lr": 5e-05, "time": 0.9934222700539976, "total_loss": 6.414404392242432} +{"data_time": 0.0006227684207260609, "eta_seconds": 334111.2649005472, "iteration": 285, "lr": 5e-05, "time": 0.9930181190138683, "total_loss": 6.414404392242432} +{"data_time": 0.0006676295306533575, "eta_seconds": 334082.6980909647, "iteration": 286, "lr": 5e-05, "time": 0.9930181190138683, "total_loss": 6.414404392242432} +{"data_time": 0.0007136190542951226, "eta_seconds": 334078.7280201912, "iteration": 287, "lr": 5e-05, "time": 0.9934222700539976, "total_loss": 6.4124915599823} +{"data_time": 0.0007136190542951226, "eta_seconds": 334074.7579671757, "iteration": 288, "lr": 5e-05, "time": 0.9934222700539976, "total_loss": 6.397109031677246} +{"data_time": 0.0006954745622351766, "eta_seconds": 334061.7818449775, "iteration": 289, "lr": 5e-05, "time": 0.9934222700539976, "total_loss": 6.397109031677246} +{"data_time": 0.0006954745622351766, "eta_seconds": 334048.80579435034, "iteration": 290, "lr": 5e-05, "time": 0.9940367945237085, "total_loss": 6.3805341720581055} +{"data_time": 0.0006954745622351766, "eta_seconds": 334059.78576599853, "iteration": 291, "lr": 5e-05, "time": 0.9944934344384819, "total_loss": 6.3805341720581055} +{"data_time": 0.0006654334720224142, "eta_seconds": 334046.8097869423, "iteration": 292, "lr": 5e-05, "time": 0.9944934344384819, "total_loss": 6.3805341720581055} +{"data_time": 0.0006654334720224142, "eta_seconds": 334057.78968701954, "iteration": 293, "lr": 5e-05, "time": 0.9944934344384819, "total_loss": 6.3805341720581055} +{"data_time": 0.0006375884404405951, "eta_seconds": 334044.8137795343, "iteration": 294, "lr": 5e-05, "time": 0.9944934344384819, "total_loss": 6.3805341720581055} +{"data_time": 0.0006654334720224142, "eta_seconds": 334019.05948511977, "iteration": 295, "lr": 5e-05, "time": 0.9940367945237085, "total_loss": 6.3805341720581055} +{"data_time": 0.0006654334720224142, "eta_seconds": 333993.30533863115, "iteration": 296, "lr": 5e-05, "time": 0.9940367945237085, "total_loss": 6.376223802566528} +{"data_time": 0.0006990574765950441, "eta_seconds": 333985.6798626669, "iteration": 297, "lr": 5e-05, "time": 0.9939206490525976, "total_loss": 6.376223802566528} +{"data_time": 0.0006654334720224142, "eta_seconds": 333978.05442630476, "iteration": 298, "lr": 5e-05, "time": 0.9935331090819091, "total_loss": 6.361267805099487} +{"data_time": 0.0006990574765950441, "eta_seconds": 333947.68301244406, "iteration": 299, "lr": 5e-05, "time": 0.9935331090819091, "total_loss": 6.357731580734253} +{"data_time": 0.0007225919980555773, "eta_seconds": 333917.3117741011, "iteration": 300, "lr": 5e-05, "time": 0.9932289840653539, "total_loss": 6.357731580734253} +{"data_time": 0.000743917073123157, "eta_seconds": 333906.3849010912, "iteration": 301, "lr": 5e-05, "time": 0.9930623305262998, "total_loss": 6.357731580734253} +{"data_time": 0.0007738874992355704, "eta_seconds": 333895.4580874124, "iteration": 302, "lr": 5e-05, "time": 0.9930623305262998, "total_loss": 6.371462345123291} +{"data_time": 0.0007738874992355704, "eta_seconds": 333879.3042220641, "iteration": 303, "lr": 5e-05, "time": 0.9930623305262998, "total_loss": 6.376223802566528} +{"data_time": 0.0007738874992355704, "eta_seconds": 333863.1504472811, "iteration": 304, "lr": 5e-05, "time": 0.9930623305262998, "total_loss": 6.376223802566528} +{"data_time": 0.0007357789436355233, "eta_seconds": 333845.68046631594, "iteration": 305, "lr": 5e-05, "time": 0.9937273630639538, "total_loss": 6.379072189331055} +{"data_time": 0.0006990574765950441, "eta_seconds": 333828.2105837816, "iteration": 306, "lr": 5e-05, "time": 0.9937273630639538, "total_loss": 6.386901378631592} +{"data_time": 0.0006654334720224142, "eta_seconds": 333820.67230878165, "iteration": 307, "lr": 5e-05, "time": 0.9932270395802334, "total_loss": 6.386901378631592} +{"data_time": 0.0006654334720224142, "eta_seconds": 333813.13407286676, "iteration": 308, "lr": 5e-05, "time": 0.9930623015388846, "total_loss": 6.398268461227417} +{"data_time": 0.0006033414974808693, "eta_seconds": 333791.7527548622, "iteration": 309, "lr": 5e-05, "time": 0.9932270395802334, "total_loss": 6.398268461227417} +{"data_time": 0.0006808298639953136, "eta_seconds": 333770.37155866274, "iteration": 310, "lr": 5e-05, "time": 0.9930623015388846, "total_loss": 6.398268461227417} +{"data_time": 0.0007253768853843212, "eta_seconds": 333789.75817017537, "iteration": 311, "lr": 5e-05, "time": 0.9930623015388846, "total_loss": 6.386901378631592} +{"data_time": 0.000746701960451901, "eta_seconds": 333768.37709578103, "iteration": 312, "lr": 5e-05, "time": 0.9930623015388846, "total_loss": 6.386901378631592} +{"data_time": 0.0007738874992355704, "eta_seconds": 333744.0893874394, "iteration": 313, "lr": 5e-05, "time": 0.9930623015388846, "total_loss": 6.386901378631592}
diff --git a/projects/MagicPrompt/pipeline.py b/projects/MagicPrompt/pipeline.py
index faabb3c36..de840e040 100644
--- a/projects/MagicPrompt/pipeline.py
+++ b/projects/MagicPrompt/pipeline.py
@@ -101,7 +101,7 @@ def postprocess(self, model_output_dict, **kwargs) -> dict:
         mode="libai",
     )
 
-    text = ["a dog", "a cute pig", "a cute girl"]
+    text = ["a dog", "a cute pig", "a cute girl", "a boy", "a dog", "a cute pig", "a cute girl", "a boy"]
     output = pipeline(inputs=text)
     if dist.is_main_process():
         print(output)

From abc8b80912995a39ea4a4a5925c99662ea86ec61 Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Mon, 17 Apr 2023 16:42:49 +0800
Subject: [PATCH 21/25] update flow.nn.functional.gelu

---
 .../oneflow_magicprompt/config.yaml           |   77 --
 .../events.out.tfevents.1681720413.oneflow-32 |    0
 .../events.out.tfevents.1681720458.oneflow-32 |  Bin 68200 -> 0 bytes
 .../MagicPrompt/oneflow_magicprompt/log.txt   | 1085 -----------------
 .../oneflow_magicprompt/metrics.json          |  325 -----
 5 files changed, 1487 deletions(-)
 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/config.yaml
 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681720413.oneflow-32
 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681720458.oneflow-32
 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/log.txt
 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/metrics.json

diff --git a/projects/MagicPrompt/oneflow_magicprompt/config.yaml b/projects/MagicPrompt/oneflow_magicprompt/config.yaml
deleted file mode 100644
index ac1d29051..000000000
--- a/projects/MagicPrompt/oneflow_magicprompt/config.yaml
+++ /dev/null
@@ -1,77 +0,0 @@
-dataloader:
-  train:
-    _target_: libai.data.build_nlp_train_val_test_loader
-    dataset:
-    - _target_: libai.data.datasets.GPT2Dataset
-      data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence
-      indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false}
-      max_seq_length: 1024
-      name: gpt-2
-      seed: 1234
-    num_workers: 4
-    splits:
-    - [949.0, 50.0, 1.0]
-    train_val_test_num_samples: null
-    weights: [1.0]
-ds:
-  _target_: libai.data.datasets.GPT2Dataset
-  data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence
-  indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false}
-  max_seq_length: 1024
-  name: gpt-2
-  seed: 1234
-graph:
-  auto_parallel: {enable_auto_parallel_ignore_user_sbp_config: false, enabled: false, sbp_collector: false, trunk_algo: true}
-  debug: 2
-  enabled: false
-  eval_graph: {_target_: libai.models.utils.GraphBase, is_train: false}
-  global_mode: {enabled: false}
-  train_graph: {_target_: libai.models.utils.GraphBase, is_train: true}
-model:
-  _target_: projects.MagicPrompt.gpt2.GPTForPreTraining
-  cfg: {amp_enabled: false, apply_query_key_layer_scaling: true, apply_residual_post_layernorm: false, attention_dropout_prob: 0.1, bias_dropout_fusion: false, bias_gelu_fusion: false, bos_token_id: 50256, chunk_size_feed_forward: 0, decoder_start_token_id: null, diversity_penalty: 0.0, do_sample: false, early_stopping: false, embedding_dropout_prob: 0.1, encoder_no_repeat_ngram_size: 0, eos_token_id: 50256, exponential_decay_length_penalty: null, ffn_hidden_size: 3072, forced_bos_token_id: null, forced_eos_token_id: null, hidden_layers: 12, hidden_size: 768, initializer_range: 0.02, is_encoder_decoder: false, layernorm_epsilon: 1.0e-05, length_penalty: 1.0, max_length: 20, max_seq_length: 1024, min_length: 0, no_repeat_ngram_size: 0, num_attention_heads: 12, num_beam_groups: 1, num_beams: 1, num_return_sequences: 1, output_dropout_prob: 0.1, output_scores: false, pad_token_id: 0, pretrained_model_path: null, remove_invalid_values: false, repetition_penalty: 1.0, scale_mask_softmax_fusion: false, sep_token_id: null, temperature: 1.0, top_k: 50, top_p: 1.0, typical_p: 1.0, use_cache: true, use_scaled_init_for_output_weights: true, vocab_size: 50257}
-optim:
-  _target_: oneflow.nn.optimizer.adamw.AdamW
-  betas: [0.9, 0.999]
-  do_bias_correction: true
-  eps: 1.0e-08
-  lr: 5.0e-05
-  params: {_target_: libai.optim.get_default_optimizer_params, clip_grad_max_norm: null, clip_grad_norm_type: null, weight_decay_bias: 0.0, weight_decay_norm: 0.0}
-  weight_decay: 0.01
-tokenization:
-  append_eod: false
-  make_vocab_size_divisible_by: 128
-  tokenizer: {_target_: libai.tokenizer.GPT2Tokenizer, do_chinese_wwm: true, do_lower_case: true, merges_file: /home/zhangxiaoyu/magicprompt/merges.txt, vocab_file: /home/zhangxiaoyu/magicprompt/vocab.json}
-train:
-  activation_checkpoint: {enabled: false}
-  amp: {enabled: false}
-  checkpointer: {max_to_keep: 20, period: 8000}
-  consumed_train_samples: 0
-  consumed_valid_samples: 0
-  dist: {data_parallel_size: 1, num_gpus_per_node: 1, num_nodes: 1, pipeline_num_layers: 10000, pipeline_parallel_size: 1, tensor_parallel_size: 1}
-  evaluation:
-    enabled: true
-    eval_iter: 250
-    eval_period: 4000
-    evaluator: {_target_: libai.evaluation.PPLEvaluator}
-  global_batch_size: 4
-  input_placement_device: cpu
-  load_weight: ''
-  log_period: 1
-  nccl_fusion_max_ops: 24
-  nccl_fusion_threshold_mb: 16
-  num_accumulation_steps: 1
-  output_dir: projects/MagicPrompt/oneflow_magicprompt
-  rdma_enabled: false
-  resume: false
-  samples: 40000
-  scheduler: {_target_: libai.scheduler.WarmupExponentialLR, gamma: 1.0, warmup_factor: 0.0, warmup_iter: 0.0, warmup_method: linear}
-  seed: 1234
-  start_iter: 0
-  test_micro_batch_size: 4
-  train_epoch: 33
-  train_iter: 10000
-  train_micro_batch_size: 4
-  train_samples: null
-  warmup_ratio: 0
-  zero_optimization: {enabled: false, stage: 1}
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681720413.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681720413.oneflow-32
deleted file mode 100644
index e69de29bb..000000000
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681720458.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681720458.oneflow-32
deleted file mode 100644
index 26dee87e72ae2222ee52f685b43005f847c6be94..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 68200
[68200 bytes of base85-encoded TFEvents binary data omitted]

diff --git a/projects/MagicPrompt/oneflow_magicprompt/log.txt b/projects/MagicPrompt/oneflow_magicprompt/log.txt
deleted file mode 100644
--- a/projects/MagicPrompt/oneflow_magicprompt/log.txt
+++ /dev/null
@@ -1,1085 +0,0 @@
-[04/17 16:33:31] lb.engine.default INFO: > compiling dataset index builder ...
-[04/17 16:33:31] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.056 seconds
-[04/17 16:33:31] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.057 seconds
-[04/17 16:33:33] lb.engine.default INFO: Prepare training, validating, testing set
-[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
-[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
-[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: reading sizes...
-[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: reading pointers...
-[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: reading document index...
-[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
-[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
-[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
-[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008427 seconds
-[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
-[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718
-[04/17 16:33:33] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[04/17 16:33:33] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[04/17 16:33:33] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[04/17 16:33:33] libai INFO: > Start building model...
-[04/17 16:33:33] lb.engine.default INFO: Model:
-GPTForPreTraining(
-  (GPT_model): GPTModel(
-    (embeddings): GPTEmbedding(
-      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
-      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
-      (dropout): Dropout(p=0.1, inplace=False)
-    )
-    (transformer): Transformer(
-      (layers): ModuleList(
-        (0): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (1): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
-        (2): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,),
eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (3): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (4): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (5): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, 
inplace=False) - ) - ) - (6): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (7): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (8): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (9): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - 
(dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (10): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (11): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - ) - (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - ) - (lm_head): LMLogits() - ) - (loss_func): GPTLoss( - (lm_loss): ParallelCrossEntropyLoss() - ) -) -[04/17 16:33:33] libai INFO: >>> done with building model. 
Building time: 0.152 seconds -[04/17 16:33:33] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR -[04/17 16:33:33] lb.engine.trainer INFO: Starting training from iteration 0 -[04/17 16:33:34] lb.utils.events INFO: iteration: 0/335008 consumed_samples: 4 total_loss: 10.86 data_time: 0.0014 s/iter lr: 5.00e-05 -[04/17 16:33:35] lb.utils.events INFO: eta: 4 days, 6:13:20 iteration: 1/335008 consumed_samples: 8 total_loss: 10.08 data_time: 0.0008 s/iter lr: 5.00e-05 -[04/17 16:33:36] lb.utils.events INFO: eta: 4 days, 5:16:34 iteration: 2/335008 consumed_samples: 12 total_loss: 9.452 time: 1.0883 s/iter data_time: 0.0010 s/iter total_throughput: 3.68 samples/s lr: 5.00e-05 -[04/17 16:33:37] lb.utils.events INFO: eta: 4 days, 5:28:00 iteration: 3/335008 consumed_samples: 16 total_loss: 9.38 time: 1.0904 s/iter data_time: 0.0008 s/iter total_throughput: 3.67 samples/s lr: 5.00e-05 -[04/17 16:33:38] lb.utils.events INFO: eta: 4 days, 5:39:27 iteration: 4/335008 consumed_samples: 20 total_loss: 9.309 time: 1.0931 s/iter data_time: 0.0008 s/iter total_throughput: 3.66 samples/s lr: 5.00e-05 -[04/17 16:33:39] lb.utils.events INFO: eta: 4 days, 5:31:22 iteration: 5/335008 consumed_samples: 24 total_loss: 9.213 time: 1.0922 s/iter data_time: 0.0007 s/iter total_throughput: 3.66 samples/s lr: 5.00e-05 -[04/17 16:33:40] lb.utils.events INFO: eta: 4 days, 5:39:25 iteration: 6/335008 consumed_samples: 28 total_loss: 9.118 time: 1.0932 s/iter data_time: 0.0006 s/iter total_throughput: 3.66 samples/s lr: 5.00e-05 -[04/17 16:33:42] lb.utils.events INFO: eta: 4 days, 5:34:34 iteration: 7/335008 consumed_samples: 32 total_loss: 9.097 time: 1.0928 s/iter data_time: 0.0006 s/iter total_throughput: 3.66 samples/s lr: 5.00e-05 -[04/17 16:33:43] lb.utils.events INFO: eta: 4 days, 5:32:48 iteration: 8/335008 consumed_samples: 36 total_loss: 9.075 time: 1.0926 s/iter data_time: 0.0006 s/iter total_throughput: 3.66 samples/s lr: 5.00e-05 -[04/17 16:33:44] lb.utils.events INFO: eta: 4 days, 5:36:04 iteration: 9/335008 consumed_samples: 40 total_loss: 8.992 time: 1.0927 s/iter data_time: 0.0006 s/iter total_throughput: 3.66 samples/s lr: 5.00e-05 -[04/17 16:33:45] lb.utils.events INFO: eta: 4 days, 5:39:20 iteration: 10/335008 consumed_samples: 44 total_loss: 8.908 time: 1.0927 s/iter data_time: 0.0006 s/iter total_throughput: 3.66 samples/s lr: 5.00e-05 -[04/17 16:34:17] libai INFO: Rank of current process: 0. 
World size: 1 -[04/17 16:34:17] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[04/17 16:34:17] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - -graph.enabled=False -graph.debug = 2 - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = False -model.cfg.bias_dropout_fusion = False -model.cfg.scale_mask_softmax_fusion = False -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = False - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 -optim.params.clip_grad_max_norm = None -optim.params.clip_grad_norm_type = None - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=1, - amp=dict(enabled=False), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=4, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers = 12, - # custom_pipeline_stage_id = [0] * 6 + [1] * 6, - # pipeline_num_layers=model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[04/17 16:34:17] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml -[04/17 16:34:17] lb.engine.default INFO: > compiling dataset index builder ... -[04/17 16:34:17] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.054 seconds -[04/17 16:34:17] lb.engine.default INFO: >>> done with compiling. 
Compilation time: 0.055 seconds -[04/17 16:34:18] lb.engine.default INFO: Prepare training, validating, testing set -[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: building dataset index ... -[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... -[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: reading sizes... -[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: reading pointers... -[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: reading document index... -[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... -[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... -[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... -[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007285 seconds -[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: -[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 -[04/17 16:34:18] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: loaded indexed 
file in 0.001 seconds -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[04/17 16:34:18] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[04/17 16:34:18] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 -[04/17 16:34:18] libai INFO: > Start building model... -[04/17 16:34:18] lb.engine.default INFO: Model: -GPTForPreTraining( - (GPT_model): GPTModel( - (embeddings): GPTEmbedding( - (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) - (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) - (dropout): Dropout(p=0.1, inplace=False) - ) - (transformer): Transformer( - (layers): ModuleList( - (0): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (1): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (2): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (3): TransformerLayer( - (drop_path): Identity() - (input_layernorm): 
LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (4): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (5): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (6): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): 
Dropout(p=0.1, inplace=False) - ) - ) - (7): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (8): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (9): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (10): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): 
GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (11): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - ) - (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - ) - (lm_head): LMLogits() - ) - (loss_func): GPTLoss( - (lm_loss): ParallelCrossEntropyLoss() - ) -) -[04/17 16:34:18] libai INFO: >>> done with building model. Building time: 0.146 seconds -[04/17 16:34:18] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR -[04/17 16:34:18] lb.engine.trainer INFO: Starting training from iteration 0 -[04/17 16:34:19] lb.utils.events INFO: iteration: 0/335008 consumed_samples: 4 total_loss: 10.86 data_time: 0.0011 s/iter lr: 5.00e-05 -[04/17 16:34:20] lb.utils.events INFO: eta: 3 days, 21:22:14 iteration: 1/335008 consumed_samples: 8 total_loss: 10.08 data_time: 0.0007 s/iter lr: 5.00e-05 -[04/17 16:34:21] lb.utils.events INFO: eta: 3 days, 21:19:35 iteration: 2/335008 consumed_samples: 12 total_loss: 9.452 time: 1.0029 s/iter data_time: 0.0008 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:22] lb.utils.events INFO: eta: 3 days, 21:17:20 iteration: 3/335008 consumed_samples: 16 total_loss: 9.38 time: 1.0025 s/iter data_time: 0.0007 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:23] lb.utils.events INFO: eta: 3 days, 21:15:05 iteration: 4/335008 consumed_samples: 20 total_loss: 9.309 time: 1.0005 s/iter data_time: 0.0006 s/iter total_throughput: 4.00 samples/s lr: 5.00e-05 -[04/17 16:34:24] lb.utils.events INFO: eta: 3 days, 21:17:18 iteration: 5/335008 consumed_samples: 24 total_loss: 9.213 time: 1.0028 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:25] lb.utils.events INFO: eta: 3 days, 21:15:03 iteration: 6/335008 consumed_samples: 28 total_loss: 9.118 time: 1.0022 s/iter data_time: 0.0006 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:26] lb.utils.events INFO: eta: 3 days, 21:12:04 iteration: 7/335008 consumed_samples: 32 total_loss: 9.097 time: 1.0020 s/iter data_time: 0.0006 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:27] lb.utils.events INFO: eta: 3 days, 21:11:32 iteration: 8/335008 consumed_samples: 36 total_loss: 9.075 time: 1.0019 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:28] lb.utils.events INFO: eta: 3 days, 21:13:16 iteration: 9/335008 consumed_samples: 40 total_loss: 8.992 time: 1.0021 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:29] lb.utils.events INFO: eta: 3 
days, 21:14:59 iteration: 10/335008 consumed_samples: 44 total_loss: 8.908 time: 1.0029 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:31] lb.utils.events INFO: eta: 3 days, 21:17:12 iteration: 11/335008 consumed_samples: 48 total_loss: 8.861 time: 1.0033 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:32] lb.utils.events INFO: eta: 3 days, 21:19:25 iteration: 12/335008 consumed_samples: 52 total_loss: 8.814 time: 1.0035 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:33] lb.utils.events INFO: eta: 3 days, 21:19:44 iteration: 13/335008 consumed_samples: 56 total_loss: 8.798 time: 1.0037 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:34] lb.utils.events INFO: eta: 3 days, 21:19:35 iteration: 14/335008 consumed_samples: 60 total_loss: 8.781 time: 1.0036 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:35] lb.utils.events INFO: eta: 3 days, 21:19:28 iteration: 15/335008 consumed_samples: 64 total_loss: 8.747 time: 1.0035 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:36] lb.utils.events INFO: eta: 3 days, 21:19:33 iteration: 16/335008 consumed_samples: 68 total_loss: 8.712 time: 1.0037 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:37] lb.utils.events INFO: eta: 3 days, 21:19:26 iteration: 17/335008 consumed_samples: 72 total_loss: 8.7 time: 1.0035 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:38] lb.utils.events INFO: eta: 3 days, 21:19:31 iteration: 18/335008 consumed_samples: 76 total_loss: 8.687 time: 1.0035 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:39] lb.utils.events INFO: eta: 3 days, 21:19:44 iteration: 19/335008 consumed_samples: 80 total_loss: 8.67 time: 1.0036 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:40] lb.utils.events INFO: eta: 3 days, 21:19:29 iteration: 20/335008 consumed_samples: 84 total_loss: 8.653 time: 1.0035 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:41] lb.utils.events INFO: eta: 3 days, 21:19:42 iteration: 21/335008 consumed_samples: 88 total_loss: 8.572 time: 1.0035 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:42] lb.utils.events INFO: eta: 3 days, 21:19:27 iteration: 22/335008 consumed_samples: 92 total_loss: 8.491 time: 1.0033 s/iter data_time: 0.0004 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:43] lb.utils.events INFO: eta: 3 days, 21:19:20 iteration: 23/335008 consumed_samples: 96 total_loss: 8.49 time: 1.0030 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:44] lb.utils.events INFO: eta: 3 days, 21:19:13 iteration: 24/335008 consumed_samples: 100 total_loss: 8.489 time: 1.0029 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:45] lb.utils.events INFO: eta: 3 days, 21:19:18 iteration: 25/335008 consumed_samples: 104 total_loss: 8.479 time: 1.0030 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:46] lb.utils.events INFO: eta: 3 days, 21:19:11 iteration: 26/335008 consumed_samples: 108 total_loss: 8.468 time: 1.0030 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 
samples/s lr: 5.00e-05 -[04/17 16:34:47] lb.utils.events INFO: eta: 3 days, 21:19:16 iteration: 27/335008 consumed_samples: 112 total_loss: 8.463 time: 1.0030 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:48] lb.utils.events INFO: eta: 3 days, 21:19:21 iteration: 28/335008 consumed_samples: 116 total_loss: 8.458 time: 1.0031 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:49] lb.utils.events INFO: eta: 3 days, 21:19:14 iteration: 29/335008 consumed_samples: 120 total_loss: 8.407 time: 1.0030 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:50] lb.utils.events INFO: eta: 3 days, 21:19:07 iteration: 30/335008 consumed_samples: 124 total_loss: 8.355 time: 1.0030 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:51] lb.utils.events INFO: eta: 3 days, 21:19:06 iteration: 31/335008 consumed_samples: 128 total_loss: 8.342 time: 1.0030 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:52] lb.utils.events INFO: eta: 3 days, 21:19:04 iteration: 32/335008 consumed_samples: 132 total_loss: 8.329 time: 1.0029 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:53] lb.utils.events INFO: eta: 3 days, 21:18:24 iteration: 33/335008 consumed_samples: 136 total_loss: 8.318 time: 1.0029 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:54] lb.utils.events INFO: eta: 3 days, 21:17:44 iteration: 34/335008 consumed_samples: 140 total_loss: 8.307 time: 1.0029 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:55] lb.utils.events INFO: eta: 3 days, 21:17:27 iteration: 35/335008 consumed_samples: 144 total_loss: 8.305 time: 1.0028 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:56] lb.utils.events INFO: eta: 3 days, 21:17:11 iteration: 36/335008 consumed_samples: 148 total_loss: 8.304 time: 1.0027 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:57] lb.utils.events INFO: eta: 3 days, 21:17:25 iteration: 37/335008 consumed_samples: 152 total_loss: 8.215 time: 1.0028 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:58] lb.utils.events INFO: eta: 3 days, 21:17:40 iteration: 38/335008 consumed_samples: 156 total_loss: 8.126 time: 1.0028 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:34:59] lb.utils.events INFO: eta: 3 days, 21:17:23 iteration: 39/335008 consumed_samples: 160 total_loss: 8.111 time: 1.0027 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:00] lb.utils.events INFO: eta: 3 days, 21:17:07 iteration: 40/335008 consumed_samples: 164 total_loss: 8.095 time: 1.0026 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:01] lb.utils.events INFO: eta: 3 days, 21:16:55 iteration: 41/335008 consumed_samples: 168 total_loss: 8.091 time: 1.0026 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:02] lb.utils.events INFO: eta: 3 days, 21:16:43 iteration: 42/335008 consumed_samples: 172 total_loss: 8.087 time: 1.0025 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:03] lb.utils.events INFO: eta: 3 days, 21:16:53 iteration: 43/335008 consumed_samples: 176 
total_loss: 8.063 time: 1.0025 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:04] lb.utils.events INFO: eta: 3 days, 21:17:03 iteration: 44/335008 consumed_samples: 180 total_loss: 8.04 time: 1.0025 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:05] lb.utils.events INFO: eta: 3 days, 21:16:51 iteration: 45/335008 consumed_samples: 184 total_loss: 8.01 time: 1.0025 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:06] lb.utils.events INFO: eta: 3 days, 21:16:39 iteration: 46/335008 consumed_samples: 188 total_loss: 7.98 time: 1.0024 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:07] lb.utils.events INFO: eta: 3 days, 21:15:55 iteration: 47/335008 consumed_samples: 192 total_loss: 7.949 time: 1.0023 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:08] lb.utils.events INFO: eta: 3 days, 21:15:12 iteration: 48/335008 consumed_samples: 196 total_loss: 7.918 time: 1.0023 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:09] lb.utils.events INFO: eta: 3 days, 21:15:53 iteration: 49/335008 consumed_samples: 200 total_loss: 7.886 time: 1.0024 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:10] lb.utils.events INFO: eta: 3 days, 21:15:10 iteration: 50/335008 consumed_samples: 204 total_loss: 7.854 time: 1.0021 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:11] lb.utils.events INFO: eta: 3 days, 21:15:51 iteration: 51/335008 consumed_samples: 208 total_loss: 7.851 time: 1.0021 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:12] lb.utils.events INFO: eta: 3 days, 21:15:08 iteration: 52/335008 consumed_samples: 212 total_loss: 7.847 time: 1.0021 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:13] lb.utils.events INFO: eta: 3 days, 21:14:57 iteration: 53/335008 consumed_samples: 216 total_loss: 7.841 time: 1.0020 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:14] lb.utils.events INFO: eta: 3 days, 21:15:06 iteration: 54/335008 consumed_samples: 220 total_loss: 7.834 time: 1.0021 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:15] lb.utils.events INFO: eta: 3 days, 21:14:55 iteration: 55/335008 consumed_samples: 224 total_loss: 7.792 time: 1.0019 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:16] lb.utils.events INFO: eta: 3 days, 21:14:44 iteration: 56/335008 consumed_samples: 228 total_loss: 7.75 time: 1.0019 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:17] lb.utils.events INFO: eta: 3 days, 21:14:53 iteration: 57/335008 consumed_samples: 232 total_loss: 7.705 time: 1.0019 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:18] lb.utils.events INFO: eta: 3 days, 21:15:02 iteration: 58/335008 consumed_samples: 236 total_loss: 7.66 time: 1.0020 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:19] lb.utils.events INFO: eta: 3 days, 21:14:51 iteration: 59/335008 consumed_samples: 240 total_loss: 7.658 time: 1.0019 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:20] 
lb.utils.events INFO: eta: 3 days, 21:14:40 iteration: 60/335008 consumed_samples: 244 total_loss: 7.657 time: 1.0018 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:21] lb.utils.events INFO: eta: 3 days, 21:14:23 iteration: 61/335008 consumed_samples: 248 total_loss: 7.652 time: 1.0018 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:22] lb.utils.events INFO: eta: 3 days, 21:14:07 iteration: 62/335008 consumed_samples: 252 total_loss: 7.648 time: 1.0018 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:23] lb.utils.events INFO: eta: 3 days, 21:14:06 iteration: 63/335008 consumed_samples: 256 total_loss: 7.628 time: 1.0017 s/iter data_time: 0.0005 s/iter total_throughput: 3.99 samples/s lr: 5.00e-05 -[04/17 16:35:24] lb.utils.events INFO: eta: 3 days, 21:14:05 iteration: 64/335008 consumed_samples: 260 total_loss: 7.608 time: 1.0089 s/iter data_time: 0.0005 s/iter total_throughput: 3.96 samples/s lr: 5.00e-05 -[04/17 16:35:26] lb.utils.events INFO: eta: 3 days, 21:14:19 iteration: 65/335008 consumed_samples: 264 total_loss: 7.562 time: 1.0179 s/iter data_time: 0.0006 s/iter total_throughput: 3.93 samples/s lr: 5.00e-05 -[04/17 16:35:27] lb.utils.events INFO: eta: 3 days, 21:14:34 iteration: 66/335008 consumed_samples: 268 total_loss: 7.517 time: 1.0267 s/iter data_time: 0.0007 s/iter total_throughput: 3.90 samples/s lr: 5.00e-05 -[04/17 16:35:29] lb.utils.events INFO: eta: 3 days, 21:14:43 iteration: 67/335008 consumed_samples: 272 total_loss: 7.516 time: 1.0350 s/iter data_time: 0.0007 s/iter total_throughput: 3.86 samples/s lr: 5.00e-05 -[04/17 16:35:30] lb.utils.events INFO: eta: 3 days, 21:14:52 iteration: 68/335008 consumed_samples: 276 total_loss: 7.514 time: 1.0429 s/iter data_time: 0.0007 s/iter total_throughput: 3.84 samples/s lr: 5.00e-05 -[04/17 16:35:32] lb.utils.events INFO: eta: 3 days, 21:15:33 iteration: 69/335008 consumed_samples: 280 total_loss: 7.482 time: 1.0508 s/iter data_time: 0.0008 s/iter total_throughput: 3.81 samples/s lr: 5.00e-05 -[04/17 16:35:34] lb.utils.events INFO: eta: 3 days, 21:16:15 iteration: 70/335008 consumed_samples: 284 total_loss: 7.45 time: 1.0586 s/iter data_time: 0.0008 s/iter total_throughput: 3.78 samples/s lr: 5.00e-05 -[04/17 16:35:35] lb.utils.events INFO: eta: 3 days, 21:16:25 iteration: 71/335008 consumed_samples: 288 total_loss: 7.446 time: 1.0661 s/iter data_time: 0.0008 s/iter total_throughput: 3.75 samples/s lr: 5.00e-05 -[04/17 16:35:37] lb.utils.events INFO: eta: 3 days, 21:16:35 iteration: 72/335008 consumed_samples: 292 total_loss: 7.442 time: 1.0732 s/iter data_time: 0.0008 s/iter total_throughput: 3.73 samples/s lr: 5.00e-05 -[04/17 16:35:38] lb.utils.events INFO: eta: 3 days, 21:16:49 iteration: 73/335008 consumed_samples: 296 total_loss: 7.412 time: 1.0803 s/iter data_time: 0.0008 s/iter total_throughput: 3.70 samples/s lr: 5.00e-05 -[04/17 16:35:40] lb.utils.events INFO: eta: 3 days, 21:17:04 iteration: 74/335008 consumed_samples: 300 total_loss: 7.382 time: 1.0873 s/iter data_time: 0.0008 s/iter total_throughput: 3.68 samples/s lr: 5.00e-05 -[04/17 16:35:42] lb.utils.events INFO: eta: 3 days, 21:17:15 iteration: 75/335008 consumed_samples: 304 total_loss: 7.382 time: 1.0939 s/iter data_time: 0.0008 s/iter total_throughput: 3.66 samples/s lr: 5.00e-05 -[04/17 16:35:43] lb.utils.events INFO: eta: 3 days, 21:17:26 iteration: 76/335008 consumed_samples: 308 total_loss: 7.382 time: 1.1003 s/iter 
[... ~237 deleted training-log lines elided: lb.utils.events INFO records for iterations 77-313 of 335008, total_loss falling from 7.389 to 6.472, time ~1.03-1.14 s/iter, data_time 0.0005-0.0013 s/iter, total_throughput rising from 3.51 to 3.87 samples/s, lr constant at 5.00e-05, eta ~3 days 21 hours ...]
diff --git a/projects/MagicPrompt/oneflow_magicprompt/metrics.json b/projects/MagicPrompt/oneflow_magicprompt/metrics.json
deleted file mode 100644
index b8092c8f1..000000000
--- a/projects/MagicPrompt/oneflow_magicprompt/metrics.json
+++ /dev/null
@@ -1,325 +0,0 @@
-{"data_time": 0.0013666469603776932, "iteration": 0, "lr": 5e-05, "total_loss": 10.86047077178955}
[... deleted metrics.json records elided: one JSON line per iteration with "data_time", "eta_seconds", "iteration", "lr", "time", and "total_loss" keys, covering two training runs whose iteration counters restart at 0, with total_loss falling from 10.86 toward ~6.5, mirroring the training log above ...]
335233.9310363168, "iteration": 132, "lr": 5e-05, "time": 0.9935553579125553, "total_loss": 6.682970762252808} -{"data_time": 0.0005584561731666327, "eta_seconds": 335226.1645225459, "iteration": 133, "lr": 5e-05, "time": 0.9935553579125553, "total_loss": 6.682970762252808} -{"data_time": 0.0005584561731666327, "eta_seconds": 335218.39804918086, "iteration": 134, "lr": 5e-05, "time": 0.9935553579125553, "total_loss": 6.6753246784210205} -{"data_time": 0.00058128556702286, "eta_seconds": 335213.74423422944, "iteration": 135, "lr": 5e-05, "time": 0.993885466014035, "total_loss": 6.6753246784210205} -{"data_time": 0.0006747349398210645, "eta_seconds": 335209.090441094, "iteration": 136, "lr": 5e-05, "time": 0.9935553579125553, "total_loss": 6.65562105178833} -{"data_time": 0.0005840284284204245, "eta_seconds": 335207.17007183586, "iteration": 137, "lr": 5e-05, "time": 0.9936283569550142, "total_loss": 6.65562105178833} -{"data_time": 0.0005840284284204245, "eta_seconds": 335205.24970806856, "iteration": 138, "lr": 5e-05, "time": 0.9939556659664959, "total_loss": 6.622169494628906} -{"data_time": 0.00058128556702286, "eta_seconds": 335199.6983238603, "iteration": 139, "lr": 5e-05, "time": 0.9946119440719485, "total_loss": 6.598361492156982} -{"data_time": 0.000555802951566875, "eta_seconds": 335194.1469668292, "iteration": 140, "lr": 5e-05, "time": 0.9946119440719485, "total_loss": 6.598361492156982} -{"data_time": 0.0005242860643193126, "eta_seconds": 335191.2503376163, "iteration": 141, "lr": 5e-05, "time": 0.9946545810671523, "total_loss": 6.598361492156982} -{"data_time": 0.0005137396510690451, "eta_seconds": 335188.3537197253, "iteration": 142, "lr": 5e-05, "time": 0.9954509395174682, "total_loss": 6.598361492156982} -{"data_time": 0.0005137396510690451, "eta_seconds": 335174.436885003, "iteration": 143, "lr": 5e-05, "time": 0.9946545810671523, "total_loss": 6.5924601554870605} -{"data_time": 0.0005137396510690451, "eta_seconds": 335160.5201274217, "iteration": 144, "lr": 5e-05, "time": 0.9954509395174682, "total_loss": 6.5924601554870605} -{"data_time": 0.0005242860643193126, "eta_seconds": 335152.2409832876, "iteration": 145, "lr": 5e-05, "time": 0.9954509395174682, "total_loss": 6.574693918228149} -{"data_time": 0.000555802951566875, "eta_seconds": 335143.9618826236, "iteration": 146, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.574693918228149} -{"data_time": 0.0005840284284204245, "eta_seconds": 335140.64378096955, "iteration": 147, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.5924601554870605} -{"data_time": 0.0006746329599991441, "eta_seconds": 335141.9601925977, "iteration": 148, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.5924601554870605} -{"data_time": 0.0007704079616814852, "eta_seconds": 335138.6421047838, "iteration": 149, "lr": 5e-05, "time": 0.9964497874025255, "total_loss": 6.574693918228149} -{"data_time": 0.0007704079616814852, "eta_seconds": 335135.32403081004, "iteration": 150, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.574693918228149} -{"data_time": 0.0007344270125031471, "eta_seconds": 335127.993125611, "iteration": 151, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.574693918228149} -{"data_time": 0.0006719154771417379, "eta_seconds": 335120.6622582197, "iteration": 152, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.555528879165649} -{"data_time": 0.0006618404295295477, "eta_seconds": 335100.5540199692, "iteration": 153, "lr": 5e-05, "time": 0.9954509395174682, "total_loss": 6.5430848598480225} 
-{"data_time": 0.0006618404295295477, "eta_seconds": 335080.44589584274, "iteration": 154, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.5430848598480225} -{"data_time": 0.0006308654556050897, "eta_seconds": 335060.4153818642, "iteration": 155, "lr": 5e-05, "time": 0.9948097829474136, "total_loss": 6.538717031478882} -{"data_time": 0.0006150599801912904, "eta_seconds": 335040.3849815468, "iteration": 156, "lr": 5e-05, "time": 0.996091929380782, "total_loss": 6.538717031478882} -{"data_time": 0.0006150599801912904, "eta_seconds": 335058.4141370433, "iteration": 157, "lr": 5e-05, "time": 0.9964497874025255, "total_loss": 6.538717031478882} -{"data_time": 0.0006308654556050897, "eta_seconds": 335038.38385038706, "iteration": 158, "lr": 5e-05, "time": 0.996667968458496, "total_loss": 6.538717031478882} -{"data_time": 0.0006618404295295477, "eta_seconds": 335034.1647824049, "iteration": 159, "lr": 5e-05, "time": 0.996667968458496, "total_loss": 6.531677722930908} -{"data_time": 0.00069171201903373, "eta_seconds": 335029.9457336464, "iteration": 160, "lr": 5e-05, "time": 0.9969171308912337, "total_loss": 6.540605068206787} -{"data_time": 0.0007017870666459203, "eta_seconds": 335020.0469476585, "iteration": 161, "lr": 5e-05, "time": 0.9970375073608011, "total_loss": 6.531677722930908} -{"data_time": 0.0007344270125031471, "eta_seconds": 335010.1482148189, "iteration": 162, "lr": 5e-05, "time": 0.9969171308912337, "total_loss": 6.5231032371521} -{"data_time": 0.0008766170358285308, "eta_seconds": 335018.0459080944, "iteration": 163, "lr": 5e-05, "time": 0.9970375073608011, "total_loss": 6.5231032371521} -{"data_time": 0.0009251345181837678, "eta_seconds": 335008.147228403, "iteration": 164, "lr": 5e-05, "time": 0.9969171308912337, "total_loss": 6.5231032371521} -{"data_time": 0.0007547514978796244, "eta_seconds": 335005.9995710214, "iteration": 165, "lr": 5e-05, "time": 0.9969171308912337, "total_loss": 6.503211975097656} -{"data_time": 0.0007061564829200506, "eta_seconds": 335003.8519204918, "iteration": 166, "lr": 5e-05, "time": 0.9969171308912337, "total_loss": 6.503211975097656} -{"data_time": 0.0007017095340415835, "eta_seconds": 335003.9985914575, "iteration": 167, "lr": 5e-05, "time": 0.9970375073608011, "total_loss": 6.503211975097656} -{"data_time": 0.00069171201903373, "eta_seconds": 335001.85094777984, "iteration": 168, "lr": 5e-05, "time": 0.9954168034018949, "total_loss": 6.503211975097656} -{"data_time": 0.0006777989910915494, "eta_seconds": 334990.38944235747, "iteration": 169, "lr": 5e-05, "time": 0.9954168034018949, "total_loss": 6.527876853942871} -{"data_time": 0.0006777989910915494, "eta_seconds": 334978.92799941916, "iteration": 170, "lr": 5e-05, "time": 0.9970375073608011, "total_loss": 6.519261360168457} -{"data_time": 0.0006468240171670914, "eta_seconds": 334988.3885321296, "iteration": 171, "lr": 5e-05, "time": 0.9974556299857795, "total_loss": 6.5367631912231445} -{"data_time": 0.0006377500249072909, "eta_seconds": 334976.92715167534, "iteration": 172, "lr": 5e-05, "time": 0.998238944564946, "total_loss": 6.5454795360565186} -{"data_time": 0.0005667039658874273, "eta_seconds": 334971.06126818527, "iteration": 173, "lr": 5e-05, "time": 0.9987658020108938, "total_loss": 6.5454795360565186} -{"data_time": 0.0005667039658874273, "eta_seconds": 334974.9263039315, "iteration": 174, "lr": 5e-05, "time": 0.9989636850077659, "total_loss": 6.5454795360565186} -{"data_time": 0.0005667039658874273, "eta_seconds": 334984.3867116738, "iteration": 175, "lr": 5e-05, "time": 
0.9990744079696015, "total_loss": 6.5454795360565186} -{"data_time": 0.0005272645503282547, "eta_seconds": 334993.84705693205, "iteration": 176, "lr": 5e-05, "time": 0.9995363809866831, "total_loss": 6.5551230907440186} -{"data_time": 0.0005272645503282547, "eta_seconds": 334982.3858014459, "iteration": 177, "lr": 5e-05, "time": 0.9990744079696015, "total_loss": 6.5551230907440186} -{"data_time": 0.0005272645503282547, "eta_seconds": 334970.92460844386, "iteration": 178, "lr": 5e-05, "time": 0.9990744079696015, "total_loss": 6.549628496170044} -{"data_time": 0.0005272645503282547, "eta_seconds": 334980.38489121804, "iteration": 179, "lr": 5e-05, "time": 0.9995363809866831, "total_loss": 6.559171199798584} -{"data_time": 0.000523862661793828, "eta_seconds": 334968.92376070004, "iteration": 180, "lr": 5e-05, "time": 0.9995363809866831, "total_loss": 6.559171199798584} -{"data_time": 0.000523862661793828, "eta_seconds": 334963.05799345765, "iteration": 181, "lr": 5e-05, "time": 0.9995363809866831, "total_loss": 6.559171199798584} -{"data_time": 0.0005238075973466039, "eta_seconds": 334957.1922552772, "iteration": 182, "lr": 5e-05, "time": 0.9995363809866831, "total_loss": 6.559171199798584} -{"data_time": 0.0005238075973466039, "eta_seconds": 334946.69257010054, "iteration": 183, "lr": 5e-05, "time": 0.9990744079696015, "total_loss": 6.559171199798584} -{"data_time": 0.0005238075973466039, "eta_seconds": 334936.1929416659, "iteration": 184, "lr": 5e-05, "time": 0.9990744079696015, "total_loss": 6.549854278564453} -{"data_time": 0.0005270520923659205, "eta_seconds": 334931.1697197673, "iteration": 185, "lr": 5e-05, "time": 0.9990744079696015, "total_loss": 6.549854278564453} -{"data_time": 0.0005270520923659205, "eta_seconds": 334926.1465218987, "iteration": 186, "lr": 5e-05, "time": 0.9987675820011646, "total_loss": 6.545806169509888} -{"data_time": 0.0005272094858810306, "eta_seconds": 334917.94152040733, "iteration": 187, "lr": 5e-05, "time": 0.9984382524853572, "total_loss": 6.5402257442474365} -{"data_time": 0.0005272094858810306, "eta_seconds": 334909.73656195216, "iteration": 188, "lr": 5e-05, "time": 0.9984382524853572, "total_loss": 6.5315093994140625} -{"data_time": 0.0005275964504107833, "eta_seconds": 334898.5958666769, "iteration": 189, "lr": 5e-05, "time": 0.9981814610073343, "total_loss": 6.5315093994140625} -{"data_time": 0.000542824505828321, "eta_seconds": 334887.45523197437, "iteration": 190, "lr": 5e-05, "time": 0.9971339404582977, "total_loss": 6.531692981719971} -{"data_time": 0.000542824505828321, "eta_seconds": 334862.1622792296, "iteration": 191, "lr": 5e-05, "time": 0.9960323289269581, "total_loss": 6.520740032196045} -{"data_time": 0.0005700605688616633, "eta_seconds": 334836.86947159586, "iteration": 192, "lr": 5e-05, "time": 0.9951661893865094, "total_loss": 6.5143210887908936} -{"data_time": 0.0006128224777057767, "eta_seconds": 334826.8279172762, "iteration": 193, "lr": 5e-05, "time": 0.9951661893865094, "total_loss": 6.5220959186553955} -{"data_time": 0.0006128224777057767, "eta_seconds": 334816.78641696554, "iteration": 194, "lr": 5e-05, "time": 0.9945189794525504, "total_loss": 6.531692981719971} -{"data_time": 0.000619517988525331, "eta_seconds": 334809.72645719815, "iteration": 195, "lr": 5e-05, "time": 0.9944486394524574, "total_loss": 6.531692981719971} -{"data_time": 0.0006571760168299079, "eta_seconds": 334802.6665336299, "iteration": 196, "lr": 5e-05, "time": 0.9942038529552519, "total_loss": 6.5220959186553955} -{"data_time": 0.0006571760168299079, 
"eta_seconds": 334796.45443529007, "iteration": 197, "lr": 5e-05, "time": 0.9934806060045958, "total_loss": 6.496842384338379} -{"data_time": 0.0006424394669011235, "eta_seconds": 334790.2423680851, "iteration": 198, "lr": 5e-05, "time": 0.9935311529552564, "total_loss": 6.485797643661499} -{"data_time": 0.0006424394669011235, "eta_seconds": 334788.8821617225, "iteration": 199, "lr": 5e-05, "time": 0.9930527874967083, "total_loss": 6.476698160171509} -{"data_time": 0.0006619264604523778, "eta_seconds": 334787.521957512, "iteration": 200, "lr": 5e-05, "time": 0.9928158218972385, "total_loss": 6.476698160171509} -{"data_time": 0.0006619264604523778, "eta_seconds": 334774.9385753176, "iteration": 201, "lr": 5e-05, "time": 0.9926744998665527, "total_loss": 6.476698160171509} -{"data_time": 0.0006824899464845657, "eta_seconds": 334762.35526231816, "iteration": 202, "lr": 5e-05, "time": 0.9926744998665527, "total_loss": 6.483594655990601} -{"data_time": 0.0007025444647297263, "eta_seconds": 334732.4790458693, "iteration": 203, "lr": 5e-05, "time": 0.9925029299920425, "total_loss": 6.476698160171509} -{"data_time": 0.0007025444647297263, "eta_seconds": 334702.6030019175, "iteration": 204, "lr": 5e-05, "time": 0.9925029299920425, "total_loss": 6.476698160171509} -{"data_time": 0.0007025444647297263, "eta_seconds": 334690.9468510486, "iteration": 205, "lr": 5e-05, "time": 0.9926744998665527, "total_loss": 6.476698160171509} -{"data_time": 0.0007025444647297263, "eta_seconds": 334679.290763838, "iteration": 206, "lr": 5e-05, "time": 0.9926744998665527, "total_loss": 6.483594655990601} -{"data_time": 0.0006630029529333115, "eta_seconds": 334645.7890883088, "iteration": 207, "lr": 5e-05, "time": 0.9926744998665527, "total_loss": 6.483594655990601} -{"data_time": 0.0006630029529333115, "eta_seconds": 334612.2876069376, "iteration": 208, "lr": 5e-05, "time": 0.9928079464007169, "total_loss": 6.491384506225586} -{"data_time": 0.0007025444647297263, "eta_seconds": 334617.1578535547, "iteration": 209, "lr": 5e-05, "time": 0.992936848429963, "total_loss": 6.491384506225586} -{"data_time": 0.0007025444647297263, "eta_seconds": 334610.28872230765, "iteration": 210, "lr": 5e-05, "time": 0.9927955263992772, "total_loss": 6.483594655990601} -{"data_time": 0.0007025444647297263, "eta_seconds": 334598.24704700103, "iteration": 211, "lr": 5e-05, "time": 0.9927955263992772, "total_loss": 6.482285022735596} -{"data_time": 0.0006753400666639209, "eta_seconds": 334586.20543765835, "iteration": 212, "lr": 5e-05, "time": 0.9927955263992772, "total_loss": 6.473680734634399} -{"data_time": 0.0006651415023952723, "eta_seconds": 334544.1248098032, "iteration": 213, "lr": 5e-05, "time": 0.9927955263992772, "total_loss": 6.472007989883423} -{"data_time": 0.0006651415023952723, "eta_seconds": 334502.0444273602, "iteration": 214, "lr": 5e-05, "time": 0.9927955263992772, "total_loss": 6.472007989883423} -{"data_time": 0.0006651415023952723, "eta_seconds": 334490.33834068105, "iteration": 215, "lr": 5e-05, "time": 0.9930449120001867, "total_loss": 6.471388816833496} -{"data_time": 0.0006651415023952723, "eta_seconds": 334478.63231796375, "iteration": 216, "lr": 5e-05, "time": 0.9932783750118688, "total_loss": 6.471388816833496} -{"data_time": 0.0006753400666639209, "eta_seconds": 334476.746382016, "iteration": 217, "lr": 5e-05, "time": 0.9932783750118688, "total_loss": 6.463591575622559} -{"data_time": 0.0007049249252304435, "eta_seconds": 334474.8604513663, "iteration": 218, "lr": 5e-05, "time": 0.9931024410761893, "total_loss": 
6.463591575622559} -{"data_time": 0.0007049249252304435, "eta_seconds": 334468.3186378875, "iteration": 219, "lr": 5e-05, "time": 0.9930178551003337, "total_loss": 6.463591575622559} -{"data_time": 0.0007908979896456003, "eta_seconds": 334461.7768575207, "iteration": 220, "lr": 5e-05, "time": 0.9930178551003337, "total_loss": 6.455276966094971} -{"data_time": 0.0007908979896456003, "eta_seconds": 334460.49928020383, "iteration": 221, "lr": 5e-05, "time": 0.993219965021126, "total_loss": 6.455276966094971} -{"data_time": 0.0007008734392002225, "eta_seconds": 334459.221704551, "iteration": 222, "lr": 5e-05, "time": 0.993219965021126, "total_loss": 6.4539055824279785} -{"data_time": 0.0007008734392002225, "eta_seconds": 334456.71181206405, "iteration": 223, "lr": 5e-05, "time": 0.993219965021126, "total_loss": 6.455276966094971} -{"data_time": 0.0007008734392002225, "eta_seconds": 334454.201928603, "iteration": 224, "lr": 5e-05, "time": 0.993219965021126, "total_loss": 6.462490081787109} -{"data_time": 0.0007008734392002225, "eta_seconds": 334435.0603942699, "iteration": 225, "lr": 5e-05, "time": 0.9930178551003337, "total_loss": 6.471423864364624} -{"data_time": 0.0006712885806336999, "eta_seconds": 334415.9189683208, "iteration": 226, "lr": 5e-05, "time": 0.9930178551003337, "total_loss": 6.462490081787109} -{"data_time": 0.0006874479586258531, "eta_seconds": 334406.67357347906, "iteration": 227, "lr": 5e-05, "time": 0.9926686605904251, "total_loss": 6.462490081787109} -{"data_time": 0.0006874479586258531, "eta_seconds": 334397.4282279024, "iteration": 228, "lr": 5e-05, "time": 0.9926686605904251, "total_loss": 6.462490081787109} -{"data_time": 0.0006712885806336999, "eta_seconds": 334377.3199458297, "iteration": 229, "lr": 5e-05, "time": 0.9923333375481889, "total_loss": 6.4539055824279785} -{"data_time": 0.0006712885806336999, "eta_seconds": 334357.21177791874, "iteration": 230, "lr": 5e-05, "time": 0.9926686605904251, "total_loss": 6.4539055824279785} -{"data_time": 0.0006874479586258531, "eta_seconds": 334348.7153953817, "iteration": 231, "lr": 5e-05, "time": 0.9923333375481889, "total_loss": 6.451754808425903} -{"data_time": 0.0007774725090712309, "eta_seconds": 334340.21905763657, "iteration": 232, "lr": 5e-05, "time": 0.9923333375481889, "total_loss": 6.451754808425903} -{"data_time": 0.0008144070161506534, "eta_seconds": 334327.28044745955, "iteration": 233, "lr": 5e-05, "time": 0.9922797135077417, "total_loss": 6.448373079299927} -{"data_time": 0.0008144070161506534, "eta_seconds": 334314.3419086137, "iteration": 234, "lr": 5e-05, "time": 0.9923333375481889, "total_loss": 6.44198203086853} -{"data_time": 0.0008661691099405289, "eta_seconds": 334312.0323186843, "iteration": 235, "lr": 5e-05, "time": 0.9923333375481889, "total_loss": 6.44198203086853} -{"data_time": 0.0008661691099405289, "eta_seconds": 334309.7227365868, "iteration": 236, "lr": 5e-05, "time": 0.9922797135077417, "total_loss": 6.448373079299927} -{"data_time": 0.0008144070161506534, "eta_seconds": 334291.178135667, "iteration": 237, "lr": 5e-05, "time": 0.9922797135077417, "total_loss": 6.451754808425903} -{"data_time": 0.0008245260687544942, "eta_seconds": 334272.63363957126, "iteration": 238, "lr": 5e-05, "time": 0.9921245854347944, "total_loss": 6.448373079299927} -{"data_time": 0.0007793945260345936, "eta_seconds": 334247.5139494054, "iteration": 239, "lr": 5e-05, "time": 0.9919842553790659, "total_loss": 6.44198203086853} -{"data_time": 0.0007429574616253376, "eta_seconds": 334270.63660499733, "iteration": 240, 
"lr": 5e-05, "time": 0.9921245854347944, "total_loss": 6.44198203086853} -{"data_time": 0.0007429574616253376, "eta_seconds": 334287.183856871, "iteration": 241, "lr": 5e-05, "time": 0.9921245854347944, "total_loss": 6.435945987701416} -{"data_time": 0.0007429574616253376, "eta_seconds": 334303.73100392055, "iteration": 242, "lr": 5e-05, "time": 0.9921245854347944, "total_loss": 6.435945987701416} -{"data_time": 0.0007060229545459151, "eta_seconds": 334304.0433104681, "iteration": 243, "lr": 5e-05, "time": 0.992783167399466, "total_loss": 6.435945987701416} -{"data_time": 0.0007060229545459151, "eta_seconds": 334304.35560918367, "iteration": 244, "lr": 5e-05, "time": 0.992783167399466, "total_loss": 6.433191299438477} -{"data_time": 0.0006449039792641997, "eta_seconds": 334315.29646015656, "iteration": 245, "lr": 5e-05, "time": 0.9935508704511449, "total_loss": 6.42838454246521} -{"data_time": 0.0006449039792641997, "eta_seconds": 334326.2372397983, "iteration": 246, "lr": 5e-05, "time": 0.9941959194839001, "total_loss": 6.4238059520721436} -{"data_time": 0.0005435205530375242, "eta_seconds": 334332.7358166594, "iteration": 247, "lr": 5e-05, "time": 0.9958192268386483, "total_loss": 6.4238059520721436} -{"data_time": 0.0005435205530375242, "eta_seconds": 334339.23434872855, "iteration": 248, "lr": 5e-05, "time": 0.9958192268386483, "total_loss": 6.419218301773071} -{"data_time": 0.000493954517878592, "eta_seconds": 334357.3438828897, "iteration": 249, "lr": 5e-05, "time": 0.9990725318202749, "total_loss": 6.421024799346924} -{"data_time": 0.0004903025692328811, "eta_seconds": 334337.2368565963, "iteration": 250, "lr": 5e-05, "time": 0.9990725318202749, "total_loss": 6.417817831039429} -{"data_time": 0.0004903025692328811, "eta_seconds": 334328.74092197884, "iteration": 251, "lr": 5e-05, "time": 0.9999000079696998, "total_loss": 6.417817831039429} -{"data_time": 0.00048560311552137136, "eta_seconds": 334335.23936446407, "iteration": 252, "lr": 5e-05, "time": 1.0011805914109573, "total_loss": 6.417817831039429} -{"data_time": 0.00048067199531942606, "eta_seconds": 334326.74347463856, "iteration": 253, "lr": 5e-05, "time": 1.0011805914109573, "total_loss": 6.417817831039429} -{"data_time": 0.00048067199531942606, "eta_seconds": 334333.2418723318, "iteration": 254, "lr": 5e-05, "time": 1.0014799559721723, "total_loss": 6.413652420043945} -{"data_time": 0.00048067199531942606, "eta_seconds": 334351.35106400773, "iteration": 255, "lr": 5e-05, "time": 1.0017993689980358, "total_loss": 6.404175043106079} -{"data_time": 0.00048067199531942606, "eta_seconds": 334331.2443801996, "iteration": 256, "lr": 5e-05, "time": 1.0017993689980358, "total_loss": 6.404175043106079} -{"data_time": 0.00048560311552137136, "eta_seconds": 334322.748579958, "iteration": 257, "lr": 5e-05, "time": 1.0017993689980358, "total_loss": 6.393757343292236} -{"data_time": 0.00048560311552137136, "eta_seconds": 334314.25282450835, "iteration": 258, "lr": 5e-05, "time": 1.0017993689980358, "total_loss": 6.393757343292236} -{"data_time": 0.00048560311552137136, "eta_seconds": 334308.9179540165, "iteration": 259, "lr": 5e-05, "time": 1.0017993689980358, "total_loss": 6.389266014099121} -{"data_time": 0.0004903025692328811, "eta_seconds": 334303.58310943167, "iteration": 260, "lr": 5e-05, "time": 1.0014799559721723, "total_loss": 6.383808374404907} -{"data_time": 0.000493954517878592, "eta_seconds": 334294.9816671086, "iteration": 261, "lr": 5e-05, "time": 1.0011805914109573, "total_loss": 6.383808374404907} -{"data_time": 
0.0005103294970467687, "eta_seconds": 334286.3802702096, "iteration": 262, "lr": 5e-05, "time": 0.9999000079696998, "total_loss": 6.383808374404907} -{"data_time": 0.0005292044952511787, "eta_seconds": 334284.0707899276, "iteration": 263, "lr": 5e-05, "time": 0.998466563061811, "total_loss": 6.383808374404907} -{"data_time": 0.0005292044952511787, "eta_seconds": 334281.76131747756, "iteration": 264, "lr": 5e-05, "time": 0.9980369674740359, "total_loss": 6.383808374404907} -{"data_time": 0.0005431510508060455, "eta_seconds": 334263.2181840949, "iteration": 265, "lr": 5e-05, "time": 0.9966902529122308, "total_loss": 6.373529672622681} -{"data_time": 0.0005570600042119622, "eta_seconds": 334244.6751555363, "iteration": 266, "lr": 5e-05, "time": 0.9954996254527941, "total_loss": 6.373529672622681} -{"data_time": 0.0005570600042119622, "eta_seconds": 334219.55748286564, "iteration": 267, "lr": 5e-05, "time": 0.9953462380217388, "total_loss": 6.373529672622681} -{"data_time": 0.0005858434597030282, "eta_seconds": 334194.4399543018, "iteration": 268, "lr": 5e-05, "time": 0.9947797351051122, "total_loss": 6.373529672622681} -{"data_time": 0.0007000388577580452, "eta_seconds": 334191.10474916664, "iteration": 269, "lr": 5e-05, "time": 0.9942065631039441, "total_loss": 6.373529672622681} -{"data_time": 0.0007000388577580452, "eta_seconds": 334187.7695579936, "iteration": 270, "lr": 5e-05, "time": 0.9939530480187386, "total_loss": 6.383808374404907} -{"data_time": 0.0007000388577580452, "eta_seconds": 334180.1698740199, "iteration": 271, "lr": 5e-05, "time": 0.9936071829870343, "total_loss": 6.399683713912964} -{"data_time": 0.000794482883065939, "eta_seconds": 334172.5702294882, "iteration": 272, "lr": 5e-05, "time": 0.9931687869830057, "total_loss": 6.383808374404907} -{"data_time": 0.0008036409271880984, "eta_seconds": 334166.3492401871, "iteration": 273, "lr": 5e-05, "time": 0.9936071829870343, "total_loss": 6.383808374404907} -{"data_time": 0.000794482883065939, "eta_seconds": 334160.1282820909, "iteration": 274, "lr": 5e-05, "time": 0.9931687869830057, "total_loss": 6.401249170303345} -{"data_time": 0.0007486874237656593, "eta_seconds": 334154.13609696925, "iteration": 275, "lr": 5e-05, "time": 0.9931687869830057, "total_loss": 6.424301862716675} -{"data_time": 0.0007550184382125735, "eta_seconds": 334158.1317049486, "iteration": 276, "lr": 5e-05, "time": 0.9931687869830057, "total_loss": 6.414404392242432} -{"data_time": 0.0007550184382125735, "eta_seconds": 334152.13954966515, "iteration": 277, "lr": 5e-05, "time": 0.9931687869830057, "total_loss": 6.424301862716675} -{"data_time": 0.0007534719770774245, "eta_seconds": 334146.1474242199, "iteration": 278, "lr": 5e-05, "time": 0.9931687869830057, "total_loss": 6.424301862716675} -{"data_time": 0.0007136190542951226, "eta_seconds": 334144.9861645689, "iteration": 279, "lr": 5e-05, "time": 0.9928738559829071, "total_loss": 6.424301862716675} -{"data_time": 0.0006624269299209118, "eta_seconds": 334144.15090675396, "iteration": 280, "lr": 5e-05, "time": 0.9931687869830057, "total_loss": 6.424301862716675} -{"data_time": 0.0006144349463284016, "eta_seconds": 334142.98964807694, "iteration": 281, "lr": 5e-05, "time": 0.9930181190138683, "total_loss": 6.420524597167969} -{"data_time": 0.0006144349463284016, "eta_seconds": 334142.15438928804, "iteration": 282, "lr": 5e-05, "time": 0.9932688130065799, "total_loss": 6.414404392242432} -{"data_time": 0.0006144349463284016, "eta_seconds": 334140.99313158495, "iteration": 283, "lr": 5e-05, "time": 
0.9932688130065799, "total_loss": 6.4124915599823} -{"data_time": 0.0006227684207260609, "eta_seconds": 334139.8318748558, "iteration": 284, "lr": 5e-05, "time": 0.9934222700539976, "total_loss": 6.414404392242432} -{"data_time": 0.0006227684207260609, "eta_seconds": 334111.2649005472, "iteration": 285, "lr": 5e-05, "time": 0.9930181190138683, "total_loss": 6.414404392242432} -{"data_time": 0.0006676295306533575, "eta_seconds": 334082.6980909647, "iteration": 286, "lr": 5e-05, "time": 0.9930181190138683, "total_loss": 6.414404392242432} -{"data_time": 0.0007136190542951226, "eta_seconds": 334078.7280201912, "iteration": 287, "lr": 5e-05, "time": 0.9934222700539976, "total_loss": 6.4124915599823} -{"data_time": 0.0007136190542951226, "eta_seconds": 334074.7579671757, "iteration": 288, "lr": 5e-05, "time": 0.9934222700539976, "total_loss": 6.397109031677246} -{"data_time": 0.0006954745622351766, "eta_seconds": 334061.7818449775, "iteration": 289, "lr": 5e-05, "time": 0.9934222700539976, "total_loss": 6.397109031677246} -{"data_time": 0.0006954745622351766, "eta_seconds": 334048.80579435034, "iteration": 290, "lr": 5e-05, "time": 0.9940367945237085, "total_loss": 6.3805341720581055} -{"data_time": 0.0006954745622351766, "eta_seconds": 334059.78576599853, "iteration": 291, "lr": 5e-05, "time": 0.9944934344384819, "total_loss": 6.3805341720581055} -{"data_time": 0.0006654334720224142, "eta_seconds": 334046.8097869423, "iteration": 292, "lr": 5e-05, "time": 0.9944934344384819, "total_loss": 6.3805341720581055} -{"data_time": 0.0006654334720224142, "eta_seconds": 334057.78968701954, "iteration": 293, "lr": 5e-05, "time": 0.9944934344384819, "total_loss": 6.3805341720581055} -{"data_time": 0.0006375884404405951, "eta_seconds": 334044.8137795343, "iteration": 294, "lr": 5e-05, "time": 0.9944934344384819, "total_loss": 6.3805341720581055} -{"data_time": 0.0006654334720224142, "eta_seconds": 334019.05948511977, "iteration": 295, "lr": 5e-05, "time": 0.9940367945237085, "total_loss": 6.3805341720581055} -{"data_time": 0.0006654334720224142, "eta_seconds": 333993.30533863115, "iteration": 296, "lr": 5e-05, "time": 0.9940367945237085, "total_loss": 6.376223802566528} -{"data_time": 0.0006990574765950441, "eta_seconds": 333985.6798626669, "iteration": 297, "lr": 5e-05, "time": 0.9939206490525976, "total_loss": 6.376223802566528} -{"data_time": 0.0006654334720224142, "eta_seconds": 333978.05442630476, "iteration": 298, "lr": 5e-05, "time": 0.9935331090819091, "total_loss": 6.361267805099487} -{"data_time": 0.0006990574765950441, "eta_seconds": 333947.68301244406, "iteration": 299, "lr": 5e-05, "time": 0.9935331090819091, "total_loss": 6.357731580734253} -{"data_time": 0.0007225919980555773, "eta_seconds": 333917.3117741011, "iteration": 300, "lr": 5e-05, "time": 0.9932289840653539, "total_loss": 6.357731580734253} -{"data_time": 0.000743917073123157, "eta_seconds": 333906.3849010912, "iteration": 301, "lr": 5e-05, "time": 0.9930623305262998, "total_loss": 6.357731580734253} -{"data_time": 0.0007738874992355704, "eta_seconds": 333895.4580874124, "iteration": 302, "lr": 5e-05, "time": 0.9930623305262998, "total_loss": 6.371462345123291} -{"data_time": 0.0007738874992355704, "eta_seconds": 333879.3042220641, "iteration": 303, "lr": 5e-05, "time": 0.9930623305262998, "total_loss": 6.376223802566528} -{"data_time": 0.0007738874992355704, "eta_seconds": 333863.1504472811, "iteration": 304, "lr": 5e-05, "time": 0.9930623305262998, "total_loss": 6.376223802566528} -{"data_time": 0.0007357789436355233, 
"eta_seconds": 333845.68046631594, "iteration": 305, "lr": 5e-05, "time": 0.9937273630639538, "total_loss": 6.379072189331055} -{"data_time": 0.0006990574765950441, "eta_seconds": 333828.2105837816, "iteration": 306, "lr": 5e-05, "time": 0.9937273630639538, "total_loss": 6.386901378631592} -{"data_time": 0.0006654334720224142, "eta_seconds": 333820.67230878165, "iteration": 307, "lr": 5e-05, "time": 0.9932270395802334, "total_loss": 6.386901378631592} -{"data_time": 0.0006654334720224142, "eta_seconds": 333813.13407286676, "iteration": 308, "lr": 5e-05, "time": 0.9930623015388846, "total_loss": 6.398268461227417} -{"data_time": 0.0006033414974808693, "eta_seconds": 333791.7527548622, "iteration": 309, "lr": 5e-05, "time": 0.9932270395802334, "total_loss": 6.398268461227417} -{"data_time": 0.0006808298639953136, "eta_seconds": 333770.37155866274, "iteration": 310, "lr": 5e-05, "time": 0.9930623015388846, "total_loss": 6.398268461227417} -{"data_time": 0.0007253768853843212, "eta_seconds": 333789.75817017537, "iteration": 311, "lr": 5e-05, "time": 0.9930623015388846, "total_loss": 6.386901378631592} -{"data_time": 0.000746701960451901, "eta_seconds": 333768.37709578103, "iteration": 312, "lr": 5e-05, "time": 0.9930623015388846, "total_loss": 6.386901378631592} -{"data_time": 0.0007738874992355704, "eta_seconds": 333744.0893874394, "iteration": 313, "lr": 5e-05, "time": 0.9930623015388846, "total_loss": 6.386901378631592} From 010b465a2997b178ca64d1d7a5e9a17b35466f70 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Mon, 17 Apr 2023 16:48:01 +0800 Subject: [PATCH 22/25] refine --- projects/MagicPrompt/configs/gpt2_training.py | 6 +- .../oneflow_magicprompt/config.yaml | 77 ++ .../events.out.tfevents.1681721120.oneflow-32 | Bin 0 -> 40 bytes .../events.out.tfevents.1681721150.oneflow-32 | Bin 0 -> 24449 bytes .../MagicPrompt/oneflow_magicprompt/log.txt | 916 ++++++++++++++++++ .../oneflow_magicprompt/metrics.json | 118 +++ 6 files changed, 1113 insertions(+), 4 deletions(-) create mode 100644 projects/MagicPrompt/oneflow_magicprompt/config.yaml create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681721120.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681721150.oneflow-32 create mode 100644 projects/MagicPrompt/oneflow_magicprompt/log.txt create mode 100644 projects/MagicPrompt/oneflow_magicprompt/metrics.json diff --git a/projects/MagicPrompt/configs/gpt2_training.py b/projects/MagicPrompt/configs/gpt2_training.py index 7b4eb3c11..f715ee666 100644 --- a/projects/MagicPrompt/configs/gpt2_training.py +++ b/projects/MagicPrompt/configs/gpt2_training.py @@ -40,7 +40,7 @@ model.cfg.scale_mask_softmax_fusion = False model.cfg.apply_query_key_layer_scaling = True model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = False +model.cfg.amp_enabled = True train.input_placement_device = "cpu" @@ -50,8 +50,6 @@ ds.max_seq_length = model.cfg.max_seq_length optim.lr = 5.0e-05 -optim.params.clip_grad_max_norm = None -optim.params.clip_grad_norm_type = None train.update( dict( @@ -61,7 +59,7 @@ train_epoch=33, train_iter=10000, log_period=1, - amp=dict(enabled=False), + amp=dict(enabled=True), warmup_ratio=0, checkpointer=dict(period=8000, max_to_keep=20), dist=dict( diff --git a/projects/MagicPrompt/oneflow_magicprompt/config.yaml b/projects/MagicPrompt/oneflow_magicprompt/config.yaml new file mode 100644 index 000000000..1397e0fc2 --- /dev/null +++ b/projects/MagicPrompt/oneflow_magicprompt/config.yaml 
@@ -0,0 +1,77 @@
+dataloader:
+  train:
+    _target_: libai.data.build_nlp_train_val_test_loader
+    dataset:
+    - _target_: libai.data.datasets.GPT2Dataset
+      data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence
+      indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false}
+      max_seq_length: 1024
+      name: gpt-2
+      seed: 1234
+    num_workers: 4
+    splits:
+    - [949.0, 50.0, 1.0]
+    train_val_test_num_samples: null
+    weights: [1.0]
+ds:
+  _target_: libai.data.datasets.GPT2Dataset
+  data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence
+  indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false}
+  max_seq_length: 1024
+  name: gpt-2
+  seed: 1234
+graph:
+  auto_parallel: {enable_auto_parallel_ignore_user_sbp_config: false, enabled: false, sbp_collector: false, trunk_algo: true}
+  debug: 2
+  enabled: false
+  eval_graph: {_target_: libai.models.utils.GraphBase, is_train: false}
+  global_mode: {enabled: false}
+  train_graph: {_target_: libai.models.utils.GraphBase, is_train: true}
+model:
+  _target_: projects.MagicPrompt.gpt2.GPTForPreTraining
+  cfg: {amp_enabled: true, apply_query_key_layer_scaling: true, apply_residual_post_layernorm: false, attention_dropout_prob: 0.1, bias_dropout_fusion: false, bias_gelu_fusion: false, bos_token_id: 50256, chunk_size_feed_forward: 0, decoder_start_token_id: null, diversity_penalty: 0.0, do_sample: false, early_stopping: false, embedding_dropout_prob: 0.1, encoder_no_repeat_ngram_size: 0, eos_token_id: 50256, exponential_decay_length_penalty: null, ffn_hidden_size: 3072, forced_bos_token_id: null, forced_eos_token_id: null, hidden_layers: 12, hidden_size: 768, initializer_range: 0.02, is_encoder_decoder: false, layernorm_epsilon: 1.0e-05, length_penalty: 1.0, max_length: 20, max_seq_length: 1024, min_length: 0, no_repeat_ngram_size: 0, num_attention_heads: 12, num_beam_groups: 1, num_beams: 1, num_return_sequences: 1, output_dropout_prob: 0.1, output_scores: false, pad_token_id: 0, pretrained_model_path: null, remove_invalid_values: false, repetition_penalty: 1.0, scale_mask_softmax_fusion: false, sep_token_id: null, temperature: 1.0, top_k: 50, top_p: 1.0, typical_p: 1.0, use_cache: true, use_scaled_init_for_output_weights: true, vocab_size: 50257}
+optim:
+  _target_: oneflow.nn.optimizer.adamw.AdamW
+  betas: [0.9, 0.999]
+  do_bias_correction: true
+  eps: 1.0e-08
+  lr: 5.0e-05
+  params: {_target_: libai.optim.get_default_optimizer_params, weight_decay_bias: 0.0, weight_decay_norm: 0.0}
+  weight_decay: 0.01
+tokenization:
+  append_eod: false
+  make_vocab_size_divisible_by: 128
+  tokenizer: {_target_: libai.tokenizer.GPT2Tokenizer, do_chinese_wwm: true, do_lower_case: true, merges_file: /home/zhangxiaoyu/magicprompt/merges.txt, vocab_file: /home/zhangxiaoyu/magicprompt/vocab.json}
+train:
+  activation_checkpoint: {enabled: false}
+  amp: {enabled: true}
+  checkpointer: {max_to_keep: 20, period: 8000}
+  consumed_train_samples: 0
+  consumed_valid_samples: 0
+  dist: {data_parallel_size: 1, num_gpus_per_node: 1, num_nodes: 1, pipeline_num_layers: 10000, pipeline_parallel_size: 1, tensor_parallel_size: 1}
+  evaluation:
+    enabled: true
+    eval_iter: 250
+    eval_period: 4000
+    evaluator: {_target_: libai.evaluation.PPLEvaluator}
+  global_batch_size: 4
+  input_placement_device: cpu
+  load_weight: ''
+  log_period: 1
+  nccl_fusion_max_ops: 24
+  nccl_fusion_threshold_mb: 16
+  num_accumulation_steps: 1
+  output_dir: projects/MagicPrompt/oneflow_magicprompt
+  rdma_enabled: false
+  resume: false
+  samples: 40000
+  scheduler: {_target_: libai.scheduler.WarmupExponentialLR, gamma: 1.0, warmup_factor: 0.0, warmup_iter: 0.0, warmup_method: linear}
+  seed: 1234
+  start_iter: 0
+  test_micro_batch_size: 4
+  train_epoch: 33
+  train_iter: 10000
+  train_micro_batch_size: 4
+  train_samples: null
+  warmup_ratio: 0
+  zero_optimization: {enabled: false, stage: 1}
diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681721120.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681721120.oneflow-32
new file mode 100644
index 0000000000000000000000000000000000000000..3bf803baa97d4366fa407e97dc9e2704e2157491
GIT binary patch
literal 40
rcmb1OfPlsI-b$Pe+L|XE`ENQ(@g@}|X6EU+mZj#ESQ$0>Mr#28w{QyD

literal 0
HcmV?d00001

diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681721150.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681721150.oneflow-32
new file mode 100644
index 0000000000000000000000000000000000000000..fb5654a1c3401e3ab6eed35fd6694aeb8245ec3f
GIT binary patch
literal 24449
[... 24449-byte base85-encoded binary payload elided ...]
zEBqG+F4IyXOyPx-=Y*w#a8yU5%KCHY!wQZ1`^z5GB)fyq*oNe+a9ZdYuIArJd}1h` z_F|zyX)6BvG+jEAxhPzac)}ZYxSy61q54i#)kWWL*ZCETMrQaQAj3Agh~AGhgE~k@f_wmgbBi5 zS>gDsrLtE!j(9h*xf97*Vc5XaT;-z2T|ihud$DkaQHmeDN&|Y5xl~xl1ZDpbGI#0=Sl_q+!fkt>i07xBWxQ;mv2*uRm{>-ux}n-;GB32c8NCODz7@`ba2_js zJ#voBV(;TcAar>}a#ncjQWCe@%+3jRc!l<2;oH)`{Xy8KlFX&TzShC=iF$qTKFwlU zN`#+yp~=Li_&s3H;!r{8t4VlvmB$(o-YX%wM7So2JGQusBM7~ol3py_=iRIvggLaM zLNCK$`JMlkH->ZAx0Fnj2z~kIX*MRkc7+{|IG!a4{WJ-8INZeVuGPeooE5tICvidH i#Tg)sIYD}{(0YJ3K7HwsKsqX%@lhq0MPKYK7XBYpq^wK; literal 0 HcmV?d00001 diff --git a/projects/MagicPrompt/oneflow_magicprompt/log.txt b/projects/MagicPrompt/oneflow_magicprompt/log.txt new file mode 100644 index 000000000..28eed1db1 --- /dev/null +++ b/projects/MagicPrompt/oneflow_magicprompt/log.txt @@ -0,0 +1,916 @@ +[04/17 16:45:18] libai INFO: Rank of current process: 0. World size: 1 +[04/17 16:45:18] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) +[04/17 16:45:18] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model +from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization +from configs.common.optim import optim + +from libai.scheduler import WarmupExponentialLR + +from configs.common.train import train +from configs.common.models.graph import graph + +graph.enabled=False +graph.debug = 2 + +vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" +merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" +train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = train_data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix + +# gpt2 model config +model.cfg.pretrained_model_path = None +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.output_dropout_prob = 0.1 +model.cfg.num_attention_heads = 12 +model.cfg.hidden_size = 768 +model.cfg.ffn_hidden_size = 4 * 768 +model.cfg.hidden_layers = 12 +model.cfg.max_seq_length = 1024 +model.cfg.initializer_range = 0.02 +model.cfg.vocab_size = 50257 +model.cfg.layernorm_epsilon = 1e-5 +model.cfg.use_scaled_init_for_output_weights = True +model.cfg.bias_gelu_fusion = False +model.cfg.bias_dropout_fusion = False +model.cfg.scale_mask_softmax_fusion = False +model.cfg.apply_query_key_layer_scaling = True +model.cfg.apply_residual_post_layernorm = False +model.cfg.amp_enabled = True + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 5.0e-05 + +train.update( + dict( + output_dir="projects/MagicPrompt/oneflow_magicprompt", + train_micro_batch_size=4, + test_micro_batch_size=4, + train_epoch=33, + train_iter=10000, + log_period=1, + amp=dict(enabled=True), + warmup_ratio=0, + checkpointer=dict(period=8000, max_to_keep=20), + dist=dict( + data_parallel_size=4, + tensor_parallel_size=1, + pipeline_parallel_size=1, + # pipeline_num_layers = 12, + # custom_pipeline_stage_id = [0] * 6 + [1] * 6, + # pipeline_num_layers=model.cfg.hidden_layers, + ), + scheduler=LazyCall(WarmupExponentialLR)( 
+            warmup_factor=0.0,
+            gamma=1.0,
+            warmup_method="linear",
+            warmup_iter=0.0,
+        ),
+        evaluation=dict(
+            enabled=True,
+            evaluator=LazyCall(PPLEvaluator)(),
+            eval_iter=250,
+            eval_period=4000,
+        ),
+        rdma_enabled=False,
+    )
+)
+
+[04/17 16:45:18] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[04/17 16:45:18] lb.engine.default INFO: > compiling dataset index builder ...
+[04/17 16:45:18] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.045 seconds
+[04/17 16:45:18] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.046 seconds
+[04/17 16:45:20] lb.engine.default INFO: Prepare training, validating, testing set
+[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
+[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file...
+[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: reading sizes...
+[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: reading pointers...
+[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: reading document index...
+[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file...
+[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap...
+[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer...
+[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008423 seconds
+[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats:
+[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO:     number of documents: 73718
+[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO:     number of sentences: 106365
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:  > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:  > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:  > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:     loaded indexed file in 0.004 seconds
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:     total number of samples: 40608
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:     total number of epochs: 9
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:  > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:  > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:  > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:     loaded indexed file in 0.001 seconds
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:     total number of samples: 4512
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:     total number of epochs: 1
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:  > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:  > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:  > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:     loaded indexed file in 0.001 seconds
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:     total number of samples: 4512
+[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO:     total number of epochs: 1
+[04/17 16:45:20] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[04/17 16:45:20] libai INFO: > Start building model...
+[04/17 16:45:20] lb.engine.default INFO: Model:
+GPTForPreTraining(
+  (GPT_model): GPTModel(
+    (embeddings): GPTEmbedding(
+      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
+      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
+      (dropout): Dropout(p=0.1, inplace=False)
+    )
+    (transformer): Transformer(
+      (layers): ModuleList(
+        (0): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (1): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
+            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
+            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
+          )
+          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): MLP(
+            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
+            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
+            (activation_func): GeLUTanh()
+            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+        )
+        (2): TransformerLayer(
+          (drop_path): Identity()
+          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (self_attention): MultiheadAttention(
+            hidden_size=768, num_heads=12, is_cross_attention=False
+            (dropout): Dropout(p=0.1, inplace=False)
+            (output_dropout): Dropout(p=0.1, inplace=False)
parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (3): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (4): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (5): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (6): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + 
(output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (7): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (8): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (9): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (10): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): 
MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (11): TransformerLayer( + (drop_path): Identity() + (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (self_attention): MultiheadAttention( + hidden_size=768, num_heads=12, is_cross_attention=False + (dropout): Dropout(p=0.1, inplace=False) + (output_dropout): Dropout(p=0.1, inplace=False) + (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) + (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) + ) + (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): MLP( + bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 + (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) + (activation_func): GeLUTanh() + (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (lm_head): LMLogits() + ) + (loss_func): GPTLoss( + (lm_loss): ParallelCrossEntropyLoss() + ) +) +[04/17 16:45:20] libai INFO: >>> done with building model. Building time: 0.182 seconds +[04/17 16:45:20] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR +[04/17 16:45:20] lb.engine.trainer INFO: Starting training from iteration 0 +[04/17 16:45:21] lb.engine.trainer ERROR: Exception during training: +Traceback (most recent call last): + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train + self.run_step() + File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step + self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) + File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 284, in run_step + self.optimizer.clip_grad() + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/optim/optimizer.py", line 508, in clip_grad + clip_grad_norm_( + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/utils/clip_grad.py", line 123, in clip_grad_norm_ + [ + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/utils/clip_grad.py", line 124, in + flow.linalg.vector_norm( + File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/linalg.py", line 24, in vector_norm + return flow._C.vector_norm(self, ord, dim, keepdim, dtype=dtype) +RuntimeError: Cannot find the kernel matching Current OperatorConf. 
The Info of OperatorConf are + op_name: sqrt_square_sum66 + op_type_name: sqrt_square_sum + DeviceType_Name: kMLU + DataType_Name of x_0: kFloat + DataType_Name of y_0: kFloat + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/functional/impl/math_functor.cpp", line 1579, in operator() + SqrtSquareSum(x) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 144, in Dispatch + Dispatch(op_expr, inputs, ctx) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 135, in Dispatch + Dispatch(op_expr, inputs, outputs.get(), ctx) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/framework/op_interpreter/op_interpreter.cpp", line 96, in Apply + internal_->Apply(op_expr, inputs, outputs, ctx) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/framework/op_interpreter/eager_global_op_interpreter.cpp", line 176, in Interpret + PhysicalRun([&](InstructionsBuilder* builder) -> Maybe ... output_eager_blob_objects), result, ctx, result->stream()); }) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/framework/instructions_builder.h", line 167, in PhysicalRun + Build(&instructions_builder) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/framework/instructions_builder.cpp", line 401, in Call + vm::OpCallInstructionPolicy::New( vm_stream, opkernel ... global_tensor_infer_result, ctx, *one::CurrentDevVmDepObjectConsumeMode()) + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/vm/op_call_instruction_policy.h", line 47, in New + ptr->Init() + File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/user/kernels/stateful_opkernel.cpp", line 918, in ChooseOpKernel + user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult(op_type_name, reg_ctx) +Error Type: oneflow.ErrorProto.op_kernel_not_found_error +[04/17 16:45:21] lb.engine.hooks INFO: Total training time: 0:00:00 (0:00:00 on hooks) +[04/17 16:45:48] libai INFO: Rank of current process: 0. 
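[Editorial note] The failure above is the crux of this first run: during optimizer.clip_grad(), flow.linalg.vector_norm lowers to a fused sqrt_square_sum op, and that kernel has no kMLU registration in oneflow-cambricon at this point. Below is a minimal host-side workaround sketch, assuming elementwise multiply, sum, and in-place mul_ do have MLU kernels and that the gradients are local (not global) tensors; clip_grad_norm_mlu_fallback is a hypothetical helper, not LiBai or OneFlow API, and not the fix this patch series ultimately takes.

    import oneflow as flow

    def clip_grad_norm_mlu_fallback(parameters, max_norm, eps=1e-6):
        """Hypothetical workaround: avoid the unregistered sqrt_square_sum kernel
        by accumulating per-parameter square sums and taking the sqrt on the host."""
        grads = [p.grad for p in parameters if p.grad is not None]
        total_sq = 0.0
        for g in grads:
            # (g * g).sum() lowers to elementwise mul + reduce_sum, which are
            # assumed (not verified here) to have MLU kernels.
            total_sq += float((g * g).sum().to("cpu").numpy())
        total_norm = total_sq ** 0.5
        clip_coef = max_norm / (total_norm + eps)
        if clip_coef < 1.0:
            for g in grads:
                g.mul_(clip_coef)  # in-place scale, stays on the device
        return total_norm

The simpler experiment would be to skip clipping entirely, if the shared optim config exposes a clip_grad_max_norm knob that can be set to None; that is an assumption about the config, not something shown in this log. The second run below gets past this point.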
+[04/17 16:45:48] libai INFO: Rank of current process: 0. World size: 1
+[04/17 16:45:48] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False)
+[04/17 16:45:48] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py:
+from libai.config import LazyCall
+from libai.evaluation import PPLEvaluator
+from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model
+from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization
+from configs.common.optim import optim
+
+from libai.scheduler import WarmupExponentialLR
+
+from configs.common.train import train
+from configs.common.models.graph import graph
+
+graph.enabled = False
+graph.debug = 2
+
+vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
+merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
+train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
+
+tokenization.tokenizer.vocab_file = vocab_file
+tokenization.tokenizer.merges_file = merge_files
+dataloader.train.dataset[0].data_prefix = train_data_prefix
+dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix
+
+# gpt2 model config
+model.cfg.pretrained_model_path = None
+model.cfg.embedding_dropout_prob = 0.1
+model.cfg.attention_dropout_prob = 0.1
+model.cfg.output_dropout_prob = 0.1
+model.cfg.num_attention_heads = 12
+model.cfg.hidden_size = 768
+model.cfg.ffn_hidden_size = 4 * 768
+model.cfg.hidden_layers = 12
+model.cfg.max_seq_length = 1024
+model.cfg.initializer_range = 0.02
+model.cfg.vocab_size = 50257
+model.cfg.layernorm_epsilon = 1e-5
+model.cfg.use_scaled_init_for_output_weights = True
+model.cfg.bias_gelu_fusion = False
+model.cfg.bias_dropout_fusion = False
+model.cfg.scale_mask_softmax_fusion = False
+model.cfg.apply_query_key_layer_scaling = True
+model.cfg.apply_residual_post_layernorm = False
+model.cfg.amp_enabled = True
+
+train.input_placement_device = "cpu"
+
+train.dist.pipeline_num_layers = model.cfg.hidden_layers
+
+for ds in dataloader.train.dataset:
+    ds.max_seq_length = model.cfg.max_seq_length
+
+optim.lr = 5.0e-05
+
+train.update(
+    dict(
+        output_dir="projects/MagicPrompt/oneflow_magicprompt",
+        train_micro_batch_size=4,
+        test_micro_batch_size=4,
+        train_epoch=33,
+        train_iter=10000,
+        log_period=1,
+        amp=dict(enabled=True),
+        warmup_ratio=0,
+        checkpointer=dict(period=8000, max_to_keep=20),
+        dist=dict(
+            data_parallel_size=4,
+            tensor_parallel_size=1,
+            pipeline_parallel_size=1,
+            # pipeline_num_layers = 12,
+            # custom_pipeline_stage_id = [0] * 6 + [1] * 6,
+            # pipeline_num_layers=model.cfg.hidden_layers,
+        ),
+        scheduler=LazyCall(WarmupExponentialLR)(
+            warmup_factor=0.0,
+            gamma=1.0,
+            warmup_method="linear",
+            warmup_iter=0.0,
+        ),
+        evaluation=dict(
+            enabled=True,
+            evaluator=LazyCall(PPLEvaluator)(),
+            eval_iter=250,
+            eval_period=4000,
+        ),
+        rdma_enabled=False,
+    )
+)
+
+[04/17 16:45:48] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml
+[04/17 16:45:48] lb.engine.default INFO: > compiling dataset index builder ...
+[04/17 16:45:48] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.045 seconds
+[04/17 16:45:48] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.046 seconds
+[04/17 16:45:50] lb.engine.default INFO: Prepare training, validating, testing set
+[04/17 16:45:50] lb.data.data_utils.indexed_dataset INFO: building dataset index ...
[... dataset-index build and indexmap loading repeat exactly as in the first run: 73718 documents, 106365 sentences; 40608 training samples (9 epochs) and 4512 samples (1 epoch) for each of the two smaller index maps ...]
+[04/17 16:45:50] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
+[04/17 16:45:50] libai INFO: > Start building model...
+[04/17 16:45:50] lb.engine.default INFO: Model:
[... GPTForPreTraining summary identical to the one printed in the first run ...]
+[04/17 16:45:50] libai INFO: >>> done with building model. Building time: 0.199 seconds
+[04/17 16:45:50] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
+[04/17 16:45:50] lb.engine.trainer INFO: Starting training from iteration 0
+[04/17 16:45:51] lb.utils.events INFO: iteration: 0/335008 consumed_samples: 4 total_loss: 10.86 data_time: 0.0013 s/iter lr: 5.00e-05
+[04/17 16:45:52] lb.utils.events INFO: eta: 3 days, 20:51:01 iteration: 1/335008 consumed_samples: 8 total_loss: 10.08 data_time: 0.0012 s/iter lr: 5.00e-05
+[04/17 16:45:53] lb.utils.events INFO: eta: 3 days, 21:02:55 iteration: 2/335008 consumed_samples: 12 total_loss: 9.452 time: 0.9999 s/iter data_time: 0.0011 s/iter total_throughput: 4.00 samples/s lr: 5.00e-05
[... iterations 3-116 elided: total_loss falls steadily from 9.38 to 6.461 at ~0.996 s/iter and 4.02 samples/s, lr constant at 5.00e-05 ...]
+[04/17 16:47:48] lb.utils.events INFO: eta: 3 days, 20:36:09 iteration: 117/335008 consumed_samples: 472 total_loss: 6.459 time: 0.9956 s/iter data_time: 0.0011 s/iter total_throughput: 4.02 samples/s lr: 5.00e-05
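[Editorial note] The derived columns in these lines follow directly from the config: total_throughput = train_micro_batch_size / time = 4 / 0.9956 ≈ 4.02 samples/s, and eta_seconds = (train.train_iter − iteration − 1) × time, e.g. (335008 − 2 − 1) × 0.99991089 ≈ 334975.15 s ≈ 3 days, 21:02:55, matching iteration 2 above. With warmup_iter=0 and gamma=1.0 the WarmupExponentialLR degenerates to a constant schedule, which is why lr never moves off 5.00e-05. A small cross-check sketch against the metrics.json added below (path and constants taken from this log; the eta formula is inferred from the numbers, not from LiBai source):

    import json

    TRAIN_ITER, MICRO_BATCH = 335008, 4  # auto-scaled train_iter / micro batch size from this log

    with open("projects/MagicPrompt/oneflow_magicprompt/metrics.json") as f:
        for line in f:
            rec = json.loads(line)
            if "time" not in rec:  # iterations 0-1 carry no smoothed time yet
                continue
            eta = (TRAIN_ITER - rec["iteration"] - 1) * rec["time"]
            throughput = MICRO_BATCH / rec["time"]
            # e.g. iteration 2: eta ≈ 334975.15 s, throughput ≈ 4.00 samples/s
            assert abs(eta - rec["eta_seconds"]) < 1.0

Every record in the excerpt satisfies this to within rounding, so the ETA shown in the log is a simple linear extrapolation of the smoothed per-iteration time.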
diff --git a/projects/MagicPrompt/oneflow_magicprompt/metrics.json b/projects/MagicPrompt/oneflow_magicprompt/metrics.json
new file mode 100644
index 000000000..aff509e45
--- /dev/null
+++ b/projects/MagicPrompt/oneflow_magicprompt/metrics.json
@@ -0,0 +1,118 @@
+{"data_time": 0.0013080858625471592, "iteration": 0, "lr": 5e-05, "total_loss": 10.86047077178955}
+{"data_time": 0.0011704793432727456, "iteration": 1, "lr": 5e-05, "total_loss": 10.084524154663086}
+{"data_time": 0.001032872823998332, "eta_seconds": 334975.14907222823, "iteration": 2, "lr": 5e-05, "time": 0.9999108940828592, "total_loss": 9.451911926269531}
[... entries for iterations 3-53 elided: same one-JSON-object-per-line schema, total_loss falling into the mid-7s ...]
+{"data_time": 0.0008361989166587591, "eta_seconds": 333425.3634246262, "iteration": 54, "lr": 5e-05, "time": 0.9954451394733042, "total_loss":
7.0815417766571045} +{"data_time": 0.0007897350005805492, "eta_seconds": 333426.3403568622, "iteration": 55, "lr": 5e-05, "time": 0.9955393135314807, "total_loss": 7.035864591598511} +{"data_time": 0.0007897350005805492, "eta_seconds": 333423.3725461243, "iteration": 56, "lr": 5e-05, "time": 0.9955393135314807, "total_loss": 6.996466159820557} +{"data_time": 0.0008276200387626886, "eta_seconds": 333415.43243699125, "iteration": 57, "lr": 5e-05, "time": 0.9955334250116721, "total_loss": 6.962551116943359} +{"data_time": 0.0008890159660950303, "eta_seconds": 333421.38166762237, "iteration": 58, "lr": 5e-05, "time": 0.9957565000513569, "total_loss": 6.923537731170654} +{"data_time": 0.0008890159660950303, "eta_seconds": 333422.3585763043, "iteration": 59, "lr": 5e-05, "time": 0.9957565000513569, "total_loss": 6.860137939453125} +{"data_time": 0.0008890159660950303, "eta_seconds": 333419.39078912046, "iteration": 60, "lr": 5e-05, "time": 0.9953510875348002, "total_loss": 6.807734727859497} +{"data_time": 0.0008276200387626886, "eta_seconds": 333411.45076292125, "iteration": 61, "lr": 5e-05, "time": 0.9953510875348002, "total_loss": 6.787320613861084} +{"data_time": 0.0008890159660950303, "eta_seconds": 333403.51077818894, "iteration": 62, "lr": 5e-05, "time": 0.9953510875348002, "total_loss": 6.767277956008911} +{"data_time": 0.0009069920051842928, "eta_seconds": 333409.45992588624, "iteration": 63, "lr": 5e-05, "time": 0.9953510875348002, "total_loss": 6.716540575027466} +{"data_time": 0.0008890159660950303, "eta_seconds": 333415.40903211664, "iteration": 64, "lr": 5e-05, "time": 0.9955666183959693, "total_loss": 6.658992528915405} +{"data_time": 0.0008890159660950303, "eta_seconds": 333407.46908885124, "iteration": 65, "lr": 5e-05, "time": 0.9953510875348002, "total_loss": 6.589884996414185} +{"data_time": 0.0008890159660950303, "eta_seconds": 333399.52918705274, "iteration": 66, "lr": 5e-05, "time": 0.9952532815514132, "total_loss": 6.522307395935059} +{"data_time": 0.0009069920051842928, "eta_seconds": 333405.47825181624, "iteration": 67, "lr": 5e-05, "time": 0.9953510875348002, "total_loss": 6.4856483936309814} +{"data_time": 0.0009069920051842928, "eta_seconds": 333397.53839148465, "iteration": 68, "lr": 5e-05, "time": 0.9952110890299082, "total_loss": 6.458977699279785} +{"data_time": 0.0009069920051842928, "eta_seconds": 333376.64233908313, "iteration": 69, "lr": 5e-05, "time": 0.9952110890299082, "total_loss": 6.435722827911377} +{"data_time": 0.0009069920051842928, "eta_seconds": 333355.7464055135, "iteration": 70, "lr": 5e-05, "time": 0.995154956006445, "total_loss": 6.397557020187378} +{"data_time": 0.0009069920051842928, "eta_seconds": 333374.6516623469, "iteration": 71, "lr": 5e-05, "time": 0.995154956006445, "total_loss": 6.370809316635132} +{"data_time": 0.0009069920051842928, "eta_seconds": 333353.75584760914, "iteration": 72, "lr": 5e-05, "time": 0.9948076440487057, "total_loss": 6.359989166259766} +{"data_time": 0.0008708799723535776, "eta_seconds": 333350.0763972802, "iteration": 73, "lr": 5e-05, "time": 0.994340977515094, "total_loss": 6.351602792739868} +{"data_time": 0.0008656535064801574, "eta_seconds": 333351.7652897048, "iteration": 74, "lr": 5e-05, "time": 0.9948076440487057, "total_loss": 6.33983039855957} +{"data_time": 0.0009017655393108726, "eta_seconds": 333370.67030887445, "iteration": 75, "lr": 5e-05, "time": 0.9948076440487057, "total_loss": 6.322655439376831} +{"data_time": 0.0009017655393108726, "eta_seconds": 333389.57520921226, "iteration": 76, "lr": 
5e-05, "time": 0.995154956006445, "total_loss": 6.28255558013916} +{"data_time": 0.0009017655393108726, "eta_seconds": 333368.6796321382, "iteration": 77, "lr": 5e-05, "time": 0.995154956006445, "total_loss": 6.254348039627075} +{"data_time": 0.0008656535064801574, "eta_seconds": 333387.58441364416, "iteration": 78, "lr": 5e-05, "time": 0.995154956006445, "total_loss": 6.229153394699097} +{"data_time": 0.000795042491517961, "eta_seconds": 333366.688955402, "iteration": 79, "lr": 5e-05, "time": 0.994889885536395, "total_loss": 6.190645933151245} +{"data_time": 0.000795042491517961, "eta_seconds": 333385.59361807606, "iteration": 80, "lr": 5e-05, "time": 0.995154956006445, "total_loss": 6.163251161575317} +{"data_time": 0.0008184099569916725, "eta_seconds": 333364.6982786658, "iteration": 81, "lr": 5e-05, "time": 0.995154956006445, "total_loss": 6.1208086013793945} +{"data_time": 0.0008184099569916725, "eta_seconds": 333343.80305808736, "iteration": 82, "lr": 5e-05, "time": 0.995154956006445, "total_loss": 6.085185289382935} +{"data_time": 0.0008184099569916725, "eta_seconds": 333340.12368789874, "iteration": 83, "lr": 5e-05, "time": 0.9950716779567301, "total_loss": 6.049023389816284} +{"data_time": 0.0008462514961138368, "eta_seconds": 333341.812500183, "iteration": 84, "lr": 5e-05, "time": 0.9950716779567301, "total_loss": 5.980789422988892} +{"data_time": 0.0008462514961138368, "eta_seconds": 333360.7169251933, "iteration": 85, "lr": 5e-05, "time": 0.9950759758939967, "total_loss": 5.924633741378784} +{"data_time": 0.0009010385256260633, "eta_seconds": 333379.62123137177, "iteration": 86, "lr": 5e-05, "time": 0.995314322062768, "total_loss": 5.882171154022217} +{"data_time": 0.0008462514961138368, "eta_seconds": 333385.5698814662, "iteration": 87, "lr": 5e-05, "time": 0.995314322062768, "total_loss": 5.864503860473633} +{"data_time": 0.0008516814559698105, "eta_seconds": 333391.51849009376, "iteration": 88, "lr": 5e-05, "time": 0.9955310776131228, "total_loss": 5.848897457122803} +{"data_time": 0.0009010385256260633, "eta_seconds": 333392.4952221201, "iteration": 89, "lr": 5e-05, "time": 0.9959323420189321, "total_loss": 5.83418869972229} +{"data_time": 0.0009010385256260633, "eta_seconds": 333393.47194236936, "iteration": 90, "lr": 5e-05, "time": 0.9959382290253416, "total_loss": 5.8199005126953125} +{"data_time": 0.0009010385256260633, "eta_seconds": 333395.55138673866, "iteration": 91, "lr": 5e-05, "time": 0.9959382290253416, "total_loss": 5.7816338539123535} +{"data_time": 0.0008516814559698105, "eta_seconds": 333391.4810403134, "iteration": 92, "lr": 5e-05, "time": 0.9959382290253416, "total_loss": 5.732883930206299} +{"data_time": 0.0008981249993667006, "eta_seconds": 333393.5604663205, "iteration": 93, "lr": 5e-05, "time": 0.9964770964579657, "total_loss": 5.694352626800537} +{"data_time": 0.0008563114097341895, "eta_seconds": 333389.4901382574, "iteration": 94, "lr": 5e-05, "time": 0.996143406489864, "total_loss": 5.668796062469482} +{"data_time": 0.0008044379064813256, "eta_seconds": 333391.5695459023, "iteration": 95, "lr": 5e-05, "time": 0.9966910509392619, "total_loss": 5.646285057067871} +{"data_time": 0.0008044379064813256, "eta_seconds": 333387.4992362014, "iteration": 96, "lr": 5e-05, "time": 0.9961521835066378, "total_loss": 5.641308546066284} +{"data_time": 0.0008044379064813256, "eta_seconds": 333389.57862548414, "iteration": 97, "lr": 5e-05, "time": 0.9967898100148886, "total_loss": 5.636359930038452} +{"data_time": 0.0008563114097341895, "eta_seconds": 
333385.5083341454, "iteration": 98, "lr": 5e-05, "time": 0.9961521835066378, "total_loss": 5.636359930038452} +{"data_time": 0.0009056684793904424, "eta_seconds": 333387.58770506596, "iteration": 99, "lr": 5e-05, "time": 0.9961569545557722, "total_loss": 5.631777763366699} +{"data_time": 0.0009056684793904424, "eta_seconds": 333389.6670576243, "iteration": 100, "lr": 5e-05, "time": 0.9961569545557722, "total_loss": 5.621784687042236} +{"data_time": 0.0009087480138987303, "eta_seconds": 333385.5967846478, "iteration": 101, "lr": 5e-05, "time": 0.9961569545557722, "total_loss": 5.610760450363159} +{"data_time": 0.0009087480138987303, "eta_seconds": 333381.52653003344, "iteration": 102, "lr": 5e-05, "time": 0.9961569545557722, "total_loss": 5.5903894901275635} +{"data_time": 0.000937630538828671, "eta_seconds": 333383.6058642296, "iteration": 103, "lr": 5e-05, "time": 0.9970831935061142, "total_loss": 5.5698912143707275} +{"data_time": 0.0009058344876393676, "eta_seconds": 333379.53562797746, "iteration": 104, "lr": 5e-05, "time": 0.9961569545557722, "total_loss": 5.5698912143707275} +{"data_time": 0.0009058344876393676, "eta_seconds": 333381.61494381144, "iteration": 105, "lr": 5e-05, "time": 0.9968412938760594, "total_loss": 5.565072059631348} +{"data_time": 0.0008784519741311669, "eta_seconds": 333383.6942412832, "iteration": 106, "lr": 5e-05, "time": 0.9966005973983556, "total_loss": 5.5545384883880615} +{"data_time": 0.0008784519741311669, "eta_seconds": 333403.3578926348, "iteration": 107, "lr": 5e-05, "time": 0.9962673855479807, "total_loss": 5.547092437744141} +{"data_time": 0.0008784519741311669, "eta_seconds": 333381.70330250286, "iteration": 108, "lr": 5e-05, "time": 0.9959162580780685, "total_loss": 5.544716119766235} +{"data_time": 0.0008784519741311669, "eta_seconds": 333401.3668304796, "iteration": 109, "lr": 5e-05, "time": 0.9959162580780685, "total_loss": 5.515957355499268} +{"data_time": 0.0009058344876393676, "eta_seconds": 333379.7123637225, "iteration": 110, "lr": 5e-05, "time": 0.9959162580780685, "total_loss": 5.470178127288818} +{"data_time": 0.000937630538828671, "eta_seconds": 333399.37576832436, "iteration": 111, "lr": 5e-05, "time": 0.9959162580780685, "total_loss": 5.440483808517456} +{"data_time": 0.000937630538828671, "eta_seconds": 333377.72142494214, "iteration": 112, "lr": 5e-05, "time": 0.9959162580780685, "total_loss": 5.419479846954346} +{"data_time": 0.0009715190390124917, "eta_seconds": 333397.38470616913, "iteration": 113, "lr": 5e-05, "time": 0.9959162580780685, "total_loss": 5.406724214553833} +{"data_time": 0.0009715190390124917, "eta_seconds": 333375.7304861618, "iteration": 114, "lr": 5e-05, "time": 0.9959162580780685, "total_loss": 5.406724214553833} +{"data_time": 0.0010186959989368916, "eta_seconds": 333371.66034172056, "iteration": 115, "lr": 5e-05, "time": 0.9952741335146129, "total_loss": 5.401771306991577} +{"data_time": 0.0010186959989368916, "eta_seconds": 333373.7395473814, "iteration": 116, "lr": 5e-05, "time": 0.9959162580780685, "total_loss": 5.377491474151611} +{"data_time": 0.0009428025223314762, "eta_seconds": 333369.6694213024, "iteration": 117, "lr": 5e-05, "time": 0.9952741335146129, "total_loss": 5.355705738067627} From 8030159ec382ba4df009497909d49e54dfec5725 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Mon, 17 Apr 2023 16:48:20 +0800 Subject: [PATCH 23/25] refine --- .../oneflow_magicprompt/config.yaml | 77 -- .../events.out.tfevents.1681721120.oneflow-32 | Bin 40 -> 0 bytes 
.../events.out.tfevents.1681721150.oneflow-32 | Bin 24449 -> 0 bytes .../MagicPrompt/oneflow_magicprompt/log.txt | 916 ------------------ .../oneflow_magicprompt/metrics.json | 118 --- 5 files changed, 1111 deletions(-) delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/config.yaml delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681721120.oneflow-32 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681721150.oneflow-32 delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/log.txt delete mode 100644 projects/MagicPrompt/oneflow_magicprompt/metrics.json diff --git a/projects/MagicPrompt/oneflow_magicprompt/config.yaml b/projects/MagicPrompt/oneflow_magicprompt/config.yaml deleted file mode 100644 index 1397e0fc2..000000000 --- a/projects/MagicPrompt/oneflow_magicprompt/config.yaml +++ /dev/null @@ -1,77 +0,0 @@ -dataloader: - train: - _target_: libai.data.build_nlp_train_val_test_loader - dataset: - - _target_: libai.data.datasets.GPT2Dataset - data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence - indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false} - max_seq_length: 1024 - name: gpt-2 - seed: 1234 - num_workers: 4 - splits: - - [949.0, 50.0, 1.0] - train_val_test_num_samples: null - weights: [1.0] -ds: - _target_: libai.data.datasets.GPT2Dataset - data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence - indexed_dataset: {_target_: libai.data.data_utils.get_indexed_dataset, data_impl: mmap, data_prefix: /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence, skip_warmup: false} - max_seq_length: 1024 - name: gpt-2 - seed: 1234 -graph: - auto_parallel: {enable_auto_parallel_ignore_user_sbp_config: false, enabled: false, sbp_collector: false, trunk_algo: true} - debug: 2 - enabled: false - eval_graph: {_target_: libai.models.utils.GraphBase, is_train: false} - global_mode: {enabled: false} - train_graph: {_target_: libai.models.utils.GraphBase, is_train: true} -model: - _target_: projects.MagicPrompt.gpt2.GPTForPreTraining - cfg: {amp_enabled: true, apply_query_key_layer_scaling: true, apply_residual_post_layernorm: false, attention_dropout_prob: 0.1, bias_dropout_fusion: false, bias_gelu_fusion: false, bos_token_id: 50256, chunk_size_feed_forward: 0, decoder_start_token_id: null, diversity_penalty: 0.0, do_sample: false, early_stopping: false, embedding_dropout_prob: 0.1, encoder_no_repeat_ngram_size: 0, eos_token_id: 50256, exponential_decay_length_penalty: null, ffn_hidden_size: 3072, forced_bos_token_id: null, forced_eos_token_id: null, hidden_layers: 12, hidden_size: 768, initializer_range: 0.02, is_encoder_decoder: false, layernorm_epsilon: 1.0e-05, length_penalty: 1.0, max_length: 20, max_seq_length: 1024, min_length: 0, no_repeat_ngram_size: 0, num_attention_heads: 12, num_beam_groups: 1, num_beams: 1, num_return_sequences: 1, output_dropout_prob: 0.1, output_scores: false, pad_token_id: 0, pretrained_model_path: null, remove_invalid_values: false, repetition_penalty: 1.0, scale_mask_softmax_fusion: false, sep_token_id: null, temperature: 1.0, top_k: 50, top_p: 1.0, typical_p: 1.0, use_cache: true, use_scaled_init_for_output_weights: true, vocab_size: 50257} -optim: - _target_: oneflow.nn.optimizer.adamw.AdamW - betas: [0.9, 0.999] - do_bias_correction: true - eps: 1.0e-08 - lr: 5.0e-05 - params: {_target_: 
libai.optim.get_default_optimizer_params, weight_decay_bias: 0.0, weight_decay_norm: 0.0} - weight_decay: 0.01 -tokenization: - append_eod: false - make_vocab_size_divisible_by: 128 - tokenizer: {_target_: libai.tokenizer.GPT2Tokenizer, do_chinese_wwm: true, do_lower_case: true, merges_file: /home/zhangxiaoyu/magicprompt/merges.txt, vocab_file: /home/zhangxiaoyu/magicprompt/vocab.json} -train: - activation_checkpoint: {enabled: false} - amp: {enabled: true} - checkpointer: {max_to_keep: 20, period: 8000} - consumed_train_samples: 0 - consumed_valid_samples: 0 - dist: {data_parallel_size: 1, num_gpus_per_node: 1, num_nodes: 1, pipeline_num_layers: 10000, pipeline_parallel_size: 1, tensor_parallel_size: 1} - evaluation: - enabled: true - eval_iter: 250 - eval_period: 4000 - evaluator: {_target_: libai.evaluation.PPLEvaluator} - global_batch_size: 4 - input_placement_device: cpu - load_weight: '' - log_period: 1 - nccl_fusion_max_ops: 24 - nccl_fusion_threshold_mb: 16 - num_accumulation_steps: 1 - output_dir: projects/MagicPrompt/oneflow_magicprompt - rdma_enabled: false - resume: false - samples: 40000 - scheduler: {_target_: libai.scheduler.WarmupExponentialLR, gamma: 1.0, warmup_factor: 0.0, warmup_iter: 0.0, warmup_method: linear} - seed: 1234 - start_iter: 0 - test_micro_batch_size: 4 - train_epoch: 33 - train_iter: 10000 - train_micro_batch_size: 4 - train_samples: null - warmup_ratio: 0 - zero_optimization: {enabled: false, stage: 1} diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681721120.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681721120.oneflow-32 deleted file mode 100644 index 3bf803baa97d4366fa407e97dc9e2704e2157491..0000000000000000000000000000000000000000 GIT binary patch (base85 payload of the deleted 40-byte event file omitted) diff --git a/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681721150.oneflow-32 b/projects/MagicPrompt/oneflow_magicprompt/events.out.tfevents.1681721150.oneflow-32 deleted file mode 100644 index fb5654a1c3401e3ab6eed35fd6694aeb8245ec3f..0000000000000000000000000000000000000000 GIT binary patch (base85 payload of the deleted 24449-byte event file omitted)
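A note on the metrics file: the metrics.json committed above (and deleted again by this patch) is a JSON-lines file, one object per training iteration with keys such as data_time, eta_seconds, iteration, lr, time, and total_loss. Below is a minimal sketch for summarizing such a file with only the Python standard library; the path matches train.output_dir from the config above, while the smoothing window is an arbitrary illustrative choice, not anything taken from the patches.

import json
from pathlib import Path

# Path taken from train.output_dir in the config above.
path = Path("projects/MagicPrompt/oneflow_magicprompt/metrics.json")

# One JSON object per line; the first couple of records carry no
# "time"/"eta_seconds" keys yet, so only "total_loss" is assumed here.
records = [json.loads(line) for line in path.read_text().splitlines() if line.strip()]
losses = [r["total_loss"] for r in records]

window = 10  # arbitrary smoothing window
tail = losses[-window:]
print(f"{len(records)} iterations, loss {losses[0]:.2f} -> {losses[-1]:.2f} "
      f"(mean of last {len(tail)}: {sum(tail) / len(tail):.2f})")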
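Before the log itself goes: the log.txt below records why the very first run died. optimizer.clip_grad() calls clip_grad_norm_, which reaches flow.linalg.vector_norm, and the underlying sqrt_square_sum op has no kernel registered for kMLU, so eager dispatch fails. The sketch below is a hypothetical userland workaround, valid for local (non-global) tensors only and not the remedy actually applied in this series: compute the norm on CPU, where the reduction kernels exist, then rescale the gradients in place on their original device. Registering an MLU kernel for sqrt_square_sum would be the more direct fix.

import oneflow as flow

def clip_grad_norm_via_cpu(parameters, max_norm, norm_type=2.0):
    # Hypothetical fallback: gather per-tensor norms on CPU, combine them,
    # then scale each gradient in place on its original (e.g. MLU) device.
    grads = [p.grad for p in parameters if p.grad is not None]
    if not grads:
        return flow.tensor(0.0)
    per_tensor = flow.stack(
        [flow.linalg.vector_norm(g.to("cpu"), norm_type) for g in grads]
    )
    total_norm = flow.linalg.vector_norm(per_tensor, norm_type)
    clip_coef = float(max_norm) / (total_norm.item() + 1e-6)
    if clip_coef < 1.0:
        for g in grads:
            g.mul_(clip_coef)  # in-place; stays on the gradient's device
    return total_norm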
zx>t_^5X#rP3PLAV=+VnvW_Q?YI|v7EAUP|{zE{cJ|Fm=k2(Qv!Ec_V%?=K+ixslAJ zLiG;YCC+aNJN%255}_@>!;G=-yMl0+&lW-0lNG9WILkS&4unVNken6T^r_@*^%E|E zu-C7o7YkkP48~83BeX+dlV1Yy@A&3($wY~;ivP6u;Sn?mgoo$m3c_BjP`$%l$>r%F z)SpLkRw(F&|WOO71-f0>@dog%thhZZ>dJG!*PD3ln7t*uhKNH$eB-Ihihpm5vt#5KD+m{1Yt*uErPHwD^#Du$nrBKAY5oka#nb^(=+Z_ zvzhqUwXRmA7YnO)SY83)P1;eR`kiKIanv~wPHsjfN`%h5@PeuQ2MBfU2-WX24Hm{Z zfpFTkrX}^~usyc4C^LFQ87 zNPdULHOYps!^5b{3Vs7Z_ zpc5c$wTtv(;o|bu0kFfXw4=iEN|oFuu6t(?PTNf;N`xo)9r`b6jCb{8k3~P)~GM5S`@ISRh zMNe1;!dhBNg!6gf7R6iq4R~Garcdk7;UHG1K8GER3ciAHTN{$I!t*^IaVfuj>jy%s zUq~+&>RG=$4?-2~P#7E%j-U81Xekl;^XIT-M&u&cVdBTJf^aY^RG&lrX{uZheyAlm zD+~!QVPo1bh99o zv%+2(^JNZivx7hw{fy+Su)O*KXKr#S2!x%VlU^(wK6RcA2)EG=h34K$d=}#?EhRz+ zeuodT!|`{Eo!9RQLXH)xclf7HZ4wBV+#oqCTyAxrTRt=88wg8jFBU2juC@i?pqpea z6$TWmT{S;T!42I7rT?375azda6^tBZVkexv=<9IguS}}!oD74E)}YG zcw*&(ogfUMr9^m&-{H3@T|R=aSaw$s4$~xj;}M8Y44oT5a#mP#>Ml1{t2I6`)L|g$ z#lrYSkNyY3m9(S6&No!@?7vnt0AUU-CBh_rho)K`z{#lp6mj4D9LH6e3RI42;rI|$QhDG^%o-#^qcI&}er!*j<7 z!V#K;3)@uSw-F}gk(?D~re<@`Y=6i>SWSDeP_eVc5D?DECv&MVi2sJ=y~I`yAbd~L;FlC#1S#aW#DoozlK{7QSVP(LJPDhL;7 zles9Yx$hYY!V+3ag!ga8D5s{k{{ljT+&h9$sYy70m8%vAt7y&&8*j+sVnT=EPu`w+ zWV~2tX0r1Z2-9gth3De=XQFQUeFI_Bd@@lY?8omgJu1Z)gbVf{B3{LLJ;2WPjXhczr!uAhi)x?cTFZEy;ykrxwR7rchC-n&#Mn* zfbcafB|!$Iq^_JVNS0Ftvp*~FV%SkI0Gmq6H|?;Szt zs!6y$esW(BPUuH+RyeKyb?(>FMOR>lUuiEE{_gnpG6>i9Cv&M#eGZR2ors@@x-wEq zggyBkYFjt10O56WecTfL3rdO$ys5eqN3N`y}Qr^ToJLn}cTaV1+2PGN;JhkDAAtx_g{@a9#Lv%;=+=eaYRii}~0voc68 z7S@cB^#@@Q?WnLF|JOBlW%5`Mj=V-DN`xPH;SuA-9w6*>->AI)98P70>Q`yw&z|^w zi|7X=XN4z{(z&_Sp%EbL|B&=zVS+w;RIV(IkFr7>L)*FAO`9gcKuxoU%9E1}tlDSlv!wWC} zotFeVJVHx}(1905CJZO+@sjw~o??mmezI6qJ zPiQF-I`KQ4bm7ux5RTY1S`f})g;qf`Wi85Hd|~CkbJK(32IuE*&pRHL0Bp!f~fa z&I)IoOy+X8x7h~5r?eLfL%Y>X2H~hgGM5TFbq`;4Q zwji9TN!Y%{;^iQ;T10YI7_}jpdvMP06$le)FBT5k+qDvej*H1$DlFi4IJEC>+Xad7Ec^42xqZE^*PKxe76jQdXq`6NjP!e z8hoeOLwm8XeZ0Oc?9g%wnM;N09j4xDVF$uLX(jXC%)NPeKL>Vrh4y0MqR>v?KsZoA=2BtDM!|CDH0LoOJVQ%~P<BT}t`x`Gn*nBMMP}sL? zEBqG+F4IyXOyPx-=Y*w#a8yU5%KCHY!wQZ1`^z5GB)fyq*oNe+a9ZdYuIArJd}1h` z_F|zyX)6BvG+jEAxhPzac)}ZYxSy61q54i#)kWWL*ZCETMrQaQAj3Agh~AGhgE~k@f_wmgbBi5 zS>gDsrLtE!j(9h*xf97*Vc5XaT;-z2T|ihud$DkaQHmeDN&|Y5xl~xl1ZDpbGI#0=Sl_q+!fkt>i07xBWxQ;mv2*uRm{>-ux}n-;GB32c8NCODz7@`ba2_js zJ#voBV(;TcAar>}a#ncjQWCe@%+3jRc!l<2;oH)`{Xy8KlFX&TzShC=iF$qTKFwlU zN`#+yp~=Li_&s3H;!r{8t4VlvmB$(o-YX%wM7So2JGQusBM7~ol3py_=iRIvggLaM zLNCK$`JMlkH->ZAx0Fnj2z~kIX*MRkc7+{|IG!a4{WJ-8INZeVuGPeooE5tICvidH i#Tg)sIYD}{(0YJ3K7HwsKsqX%@lhq0MPKYK7XBYpq^wK; diff --git a/projects/MagicPrompt/oneflow_magicprompt/log.txt b/projects/MagicPrompt/oneflow_magicprompt/log.txt deleted file mode 100644 index 28eed1db1..000000000 --- a/projects/MagicPrompt/oneflow_magicprompt/log.txt +++ /dev/null @@ -1,916 +0,0 @@ -[04/17 16:45:18] libai INFO: Rank of current process: 0. 
World size: 1 -[04/17 16:45:18] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[04/17 16:45:18] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - -graph.enabled=False -graph.debug = 2 - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = False -model.cfg.bias_dropout_fusion = False -model.cfg.scale_mask_softmax_fusion = False -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = True - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=1, - amp=dict(enabled=True), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=4, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers = 12, - # custom_pipeline_stage_id = [0] * 6 + [1] * 6, - # pipeline_num_layers=model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[04/17 16:45:18] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml -[04/17 16:45:18] lb.engine.default INFO: > compiling dataset index builder ... -[04/17 16:45:18] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.045 seconds -[04/17 16:45:18] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.046 seconds -[04/17 16:45:20] lb.engine.default INFO: Prepare training, validating, testing set -[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: building dataset index ... 
-[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... -[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: reading sizes... -[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: reading pointers... -[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: reading document index... -[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... -[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... -[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... -[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.008423 seconds -[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: -[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 -[04/17 16:45:20] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[04/17 16:45:20] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1 -[04/17 16:45:20] 
lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0 -[04/17 16:45:20] libai INFO: > Start building model... -[04/17 16:45:20] lb.engine.default INFO: Model: -GPTForPreTraining( - (GPT_model): GPTModel( - (embeddings): GPTEmbedding( - (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768) - (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768) - (dropout): Dropout(p=0.1, inplace=False) - ) - (transformer): Transformer( - (layers): ModuleList( - (0): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (1): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (2): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (3): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - 
(output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (4): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (5): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (6): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (7): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): 
MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (8): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (9): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (10): TransformerLayer( - (drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - (11): TransformerLayer( - 
(drop_path): Identity() - (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (self_attention): MultiheadAttention( - hidden_size=768, num_heads=12, is_cross_attention=False - (dropout): Dropout(p=0.1, inplace=False) - (output_dropout): Dropout(p=0.1, inplace=False) - (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col) - (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row) - ) - (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - (mlp): MLP( - bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1 - (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col) - (activation_func): GeLUTanh() - (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row) - (dropout): Dropout(p=0.1, inplace=False) - ) - ) - ) - (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) - ) - (lm_head): LMLogits() - ) - (loss_func): GPTLoss( - (lm_loss): ParallelCrossEntropyLoss() - ) -) -[04/17 16:45:20] libai INFO: >>> done with building model. Building time: 0.182 seconds -[04/17 16:45:20] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR -[04/17 16:45:20] lb.engine.trainer INFO: Starting training from iteration 0 -[04/17 16:45:21] lb.engine.trainer ERROR: Exception during training: -Traceback (most recent call last): - File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 146, in train - self.run_step() - File "/home/zhangxiaoyu/libai/libai/engine/default.py", line 491, in run_step - self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device) - File "/home/zhangxiaoyu/libai/libai/engine/trainer.py", line 284, in run_step - self.optimizer.clip_grad() - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/optim/optimizer.py", line 508, in clip_grad - clip_grad_norm_( - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/utils/clip_grad.py", line 123, in clip_grad_norm_ - [ - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/nn/utils/clip_grad.py", line 124, in <listcomp> - flow.linalg.vector_norm( - File "/home/zhangxiaoyu/oneflow-cambricon/python/oneflow/linalg.py", line 24, in vector_norm - return flow._C.vector_norm(self, ord, dim, keepdim, dtype=dtype) -RuntimeError: Cannot find the kernel matching Current OperatorConf. The Info of OperatorConf are - op_name: sqrt_square_sum66 - op_type_name: sqrt_square_sum - DeviceType_Name: kMLU - DataType_Name of x_0: kFloat - DataType_Name of y_0: kFloat - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/functional/impl/math_functor.cpp", line 1579, in operator() - SqrtSquareSum(x) - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 144, in Dispatch - Dispatch(op_expr, inputs, ctx) - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 135, in Dispatch - Dispatch(op_expr, inputs, outputs.get(), ctx) - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/framework/op_interpreter/op_interpreter.cpp", line 96, in Apply - internal_->Apply(op_expr, inputs, outputs, ctx) - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/framework/op_interpreter/eager_global_op_interpreter.cpp", line 176, in Interpret - PhysicalRun([&](InstructionsBuilder* builder) -> Maybe ...
output_eager_blob_objects), result, ctx, result->stream()); }) - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/framework/instructions_builder.h", line 167, in PhysicalRun - Build(&instructions_builder) - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/framework/instructions_builder.cpp", line 401, in Call - vm::OpCallInstructionPolicy::New( vm_stream, opkernel ... global_tensor_infer_result, ctx, *one::CurrentDevVmDepObjectConsumeMode()) - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/core/vm/op_call_instruction_policy.h", line 47, in New - ptr->Init() - File "/home/zhangxiaoyu/oneflow-cambricon/oneflow/user/kernels/stateful_opkernel.cpp", line 918, in ChooseOpKernel - user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult(op_type_name, reg_ctx) -Error Type: oneflow.ErrorProto.op_kernel_not_found_error -[04/17 16:45:21] lb.engine.hooks INFO: Total training time: 0:00:00 (0:00:00 on hooks) -[04/17 16:45:48] libai INFO: Rank of current process: 0. World size: 1 -[04/17 16:45:48] libai INFO: Command line arguments: Namespace(config_file='projects/MagicPrompt/configs/gpt2_training.py', eval_only=False, fast_dev_run=False, opts=[], resume=False) -[04/17 16:45:48] libai INFO: Contents of args.config_file=projects/MagicPrompt/configs/gpt2_training.py: -from libai.config import LazyCall -from libai.evaluation import PPLEvaluator -from projects.MagicPrompt.configs.gpt2_inference import pretrain_model as model -from projects.MagicPrompt.configs.gpt2_dataset import dataloader, tokenization -from configs.common.optim import optim - -from libai.scheduler import WarmupExponentialLR - -from configs.common.train import train -from configs.common.models.graph import graph - -graph.enabled=False -graph.debug = 2 - -vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json" -merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt" -train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence" - -tokenization.tokenizer.vocab_file = vocab_file -tokenization.tokenizer.merges_file = merge_files -dataloader.train.dataset[0].data_prefix = train_data_prefix -dataloader.train.dataset[0].indexed_dataset.data_prefix = train_data_prefix - -# gpt2 model config -model.cfg.pretrained_model_path = None -model.cfg.embedding_dropout_prob = 0.1 -model.cfg.attention_dropout_prob = 0.1 -model.cfg.output_dropout_prob = 0.1 -model.cfg.num_attention_heads = 12 -model.cfg.hidden_size = 768 -model.cfg.ffn_hidden_size = 4 * 768 -model.cfg.hidden_layers = 12 -model.cfg.max_seq_length = 1024 -model.cfg.initializer_range = 0.02 -model.cfg.vocab_size = 50257 -model.cfg.layernorm_epsilon = 1e-5 -model.cfg.use_scaled_init_for_output_weights = True -model.cfg.bias_gelu_fusion = False -model.cfg.bias_dropout_fusion = False -model.cfg.scale_mask_softmax_fusion = False -model.cfg.apply_query_key_layer_scaling = True -model.cfg.apply_residual_post_layernorm = False -model.cfg.amp_enabled = True - -train.input_placement_device = "cpu" - -train.dist.pipeline_num_layers = model.cfg.hidden_layers - -for ds in dataloader.train.dataset: - ds.max_seq_length = model.cfg.max_seq_length - -optim.lr = 5.0e-05 - -train.update( - dict( - output_dir="projects/MagicPrompt/oneflow_magicprompt", - train_micro_batch_size=4, - test_micro_batch_size=4, - train_epoch=33, - train_iter=10000, - log_period=1, - amp=dict(enabled=True), - warmup_ratio=0, - checkpointer=dict(period=8000, max_to_keep=20), - dist=dict( - data_parallel_size=4, - tensor_parallel_size=1, - pipeline_parallel_size=1, - # pipeline_num_layers = 12, - 
# custom_pipeline_stage_id = [0] * 6 + [1] * 6, - # pipeline_num_layers=model.cfg.hidden_layers, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - warmup_iter=0.0, - ), - evaluation=dict( - enabled=True, - evaluator=LazyCall(PPLEvaluator)(), - eval_iter=250, - eval_period=4000, - ), - rdma_enabled=False, - ) -) - -[04/17 16:45:48] libai INFO: Full config saved to projects/MagicPrompt/oneflow_magicprompt/config.yaml -[04/17 16:45:48] lb.engine.default INFO: > compiling dataset index builder ... -[04/17 16:45:48] lb.engine.default INFO: >>> done with dataset index builder. Compilation time: 0.045 seconds -[04/17 16:45:48] lb.engine.default INFO: >>> done with compiling. Compilation time: 0.046 seconds -[04/17 16:45:50] lb.engine.default INFO: Prepare training, validating, testing set -[04/17 16:45:50] lb.data.data_utils.indexed_dataset INFO: building dataset index ... -[04/17 16:45:50] lb.data.data_utils.indexed_dataset INFO: warming up index mmap file... -[04/17 16:45:50] lb.data.data_utils.indexed_dataset INFO: reading sizes... -[04/17 16:45:50] lb.data.data_utils.indexed_dataset INFO: reading pointers... -[04/17 16:45:50] lb.data.data_utils.indexed_dataset INFO: reading document index... -[04/17 16:45:50] lb.data.data_utils.indexed_dataset INFO: warming up data mmap file... -[04/17 16:45:50] lb.data.data_utils.indexed_dataset INFO: creating numpy buffer of mmap... -[04/17 16:45:50] lb.data.data_utils.indexed_dataset INFO: creating memory view of numpy buffer... -[04/17 16:45:50] lb.data.data_utils.indexed_dataset INFO: Finished creating indexed dataset in 0.007694 seconds -[04/17 16:45:50] lb.data.data_utils.indexed_dataset INFO: indexed dataset stats: -[04/17 16:45:50] lb.data.data_utils.indexed_dataset INFO: number of documents: 73718 -[04/17 16:45:50] lb.data.data_utils.indexed_dataset INFO: number of sentences: 106365 -[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_doc_idx.npy -[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_sample_idx.npy -[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_40000ns_1024sl_1234s_shuffle_idx.npy -[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.004 seconds -[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: total number of samples: 40608 -[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: total number of epochs: 9 -[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_doc_idx.npy -[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_sample_idx.npy -[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_3000ns_1024sl_1234s_shuffle_idx.npy -[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds -[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512 -[04/17 
-[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: > loading doc-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_doc_idx.npy
-[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: > loading sample-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_sample_idx.npy
-[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: > loading shuffle-idx mapping from /home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence_gpt-2_indexmap_1000ns_1024sl_1234s_shuffle_idx.npy
-[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: loaded indexed file in 0.001 seconds
-[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: total number of samples: 4512
-[04/17 16:45:50] lb.data.datasets.gpt_dataset INFO: total number of epochs: 1
-[04/17 16:45:50] lb.engine.default INFO: Auto-scaling the config to train.train_iter=335008, train.warmup_iter=0
-[04/17 16:45:50] libai INFO: > Start building model...
-[04/17 16:45:50] lb.engine.default INFO: Model:
-GPTForPreTraining(
-  (GPT_model): GPTModel(
-    (embeddings): GPTEmbedding(
-      (token_embeddings): VocabEmbedding(num_embeddings=50304, embedding_dim=768)
-      (position_embeddings): Embedding(num_embeddings=1024, embedding_dim=768)
-      (dropout): Dropout(p=0.1, inplace=False)
-    )
-    (transformer): Transformer(
-      (layers): ModuleList(
-        (0): TransformerLayer(
-          (drop_path): Identity()
-          (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (self_attention): MultiheadAttention(
-            hidden_size=768, num_heads=12, is_cross_attention=False
-            (dropout): Dropout(p=0.1, inplace=False)
-            (output_dropout): Dropout(p=0.1, inplace=False)
-            (query_key_value): Linear1D(in_features=768, out_features=2304, bias=True, parallel=col)
-            (dense): Linear1D(in_features=768, out_features=768, bias=True, parallel=row)
-          )
-          (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-          (mlp): MLP(
-            bias_gelu_fusion=False, bias_dropout_fusion=False, dropout=0.1
-            (dense_h_to_4h): Linear1D(in_features=768, out_features=3072, bias=True, parallel=col)
-            (activation_func): GeLUTanh()
-            (dense_4h_to_h): Linear1D(in_features=3072, out_features=768, bias=True, parallel=row)
-            (dropout): Dropout(p=0.1, inplace=False)
-          )
-        )
[... layers (1) through (11) elided: each is printed identically to layer (0) ...]
-      )
-      (layernorm_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-    )
-    (lm_head): LMLogits()
-  )
-  (loss_func): GPTLoss(
-    (lm_loss): ParallelCrossEntropyLoss()
-  )
-)
-[04/17 16:45:50] libai INFO: >>> done with building model. Building time: 0.199 seconds
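As a quick sanity check on the module printout above (not part of the patches): the printed shapes -- padded vocab 50304, hidden size 768, 1024 positions, 12 layers -- add up to the usual ~124M-parameter GPT-2 small. A rough tally in Python, assuming the LM head reuses the token embedding so LMLogits adds no weights of its own:

    # Parameter tally; every number below is read off the printout above.
    hidden, layers, vocab, positions = 768, 12, 50304, 1024

    qkv = hidden * (3 * hidden) + 3 * hidden   # query_key_value: 768 -> 2304, plus bias
    attn_out = hidden * hidden + hidden        # dense: 768 -> 768, plus bias
    mlp = (hidden * 4 * hidden + 4 * hidden) + (4 * hidden * hidden + hidden)
    norms = 2 * 2 * hidden                     # two LayerNorms per layer, weight + bias
    per_layer = qkv + attn_out + mlp + norms   # 7,087,872

    embeddings = vocab * hidden + positions * hidden
    total = layers * per_layer + embeddings + 2 * hidden  # + final layernorm_f
    print(f"{total:,}")  # 124,475,904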
-[04/17 16:45:50] lb.scheduler.lr_scheduler WARNING: warmup iters equals to zero, return ExponentialLR
-[04/17 16:45:50] lb.engine.trainer INFO: Starting training from iteration 0
-[04/17 16:45:51] lb.utils.events INFO: iteration: 0/335008 consumed_samples: 4 total_loss: 10.86 data_time: 0.0013 s/iter lr: 5.00e-05
-[04/17 16:45:52] lb.utils.events INFO: eta: 3 days, 20:51:01 iteration: 1/335008 consumed_samples: 8 total_loss: 10.08 data_time: 0.0012 s/iter lr: 5.00e-05
-[04/17 16:45:53] lb.utils.events INFO: eta: 3 days, 21:02:55 iteration: 2/335008 consumed_samples: 12 total_loss: 9.452 time: 0.9999 s/iter data_time: 0.0011 s/iter total_throughput: 4.00 samples/s lr: 5.00e-05
-[04/17 16:45:54] lb.utils.events INFO: eta: 3 days, 20:46:14 iteration: 3/335008 consumed_samples: 16 total_loss: 9.38 time: 0.9969 s/iter data_time: 0.0012 s/iter total_throughput: 4.01 samples/s lr: 5.00e-05
-[04/17 16:45:55] lb.utils.events INFO: eta: 3 days, 20:37:01 iteration: 4/335008 consumed_samples: 20 total_loss: 9.309 time: 0.9964 s/iter data_time: 0.0010 s/iter total_throughput: 4.01 samples/s lr: 5.00e-05
-[04/17 16:45:56] lb.utils.events INFO: eta: 3 days, 20:36:21 iteration: 5/335008 consumed_samples: 24 total_loss: 9.213 time: 0.9960 s/iter data_time: 0.0012 s/iter total_throughput: 4.02 samples/s lr: 5.00e-05
[... per-iteration lines for iterations 6-115 elided: time holds at ~0.996 s/iter, total_throughput at 4.02 samples/s, and total_loss falls steadily from 9.118 to 6.486 ...]
-[04/17 16:47:47] lb.utils.events INFO: eta: 3 days, 20:36:13 iteration: 116/335008 consumed_samples: 468 total_loss: 6.461 time: 0.9957 s/iter data_time: 0.0011 s/iter total_throughput: 4.02 samples/s lr: 5.00e-05
-[04/17 16:47:48] lb.utils.events INFO: eta: 3 days, 20:36:09 iteration: 117/335008 consumed_samples: 472 total_loss: 6.459 time: 0.9956 s/iter data_time: 0.0011 s/iter total_throughput: 4.02 samples/s lr: 5.00e-05
diff --git a/projects/MagicPrompt/oneflow_magicprompt/metrics.json b/projects/MagicPrompt/oneflow_magicprompt/metrics.json
deleted file mode 100644
index aff509e45..000000000
--- a/projects/MagicPrompt/oneflow_magicprompt/metrics.json
+++ /dev/null
@@ -1,118 +0,0 @@
-{"data_time": 0.0013080858625471592, "iteration": 0, "lr": 5e-05, "total_loss": 10.86047077178955}
-{"data_time": 0.0011704793432727456, "iteration": 1, "lr": 5e-05, "total_loss": 10.084524154663086}
-{"data_time": 0.001032872823998332, "eta_seconds": 334975.14907222823, "iteration": 2, "lr": 5e-05, "time": 0.9999108940828592, "total_loss": 9.451911926269531}
[... 114 similar JSON-lines entries for iterations 3-116 elided; total_loss falls steadily to 5.377 ...]
-{"data_time": 0.0009428025223314762, "eta_seconds": 333369.6694213024, "iteration": 117, "lr": 5e-05, "time": 0.9952741335146129, "total_loss": 5.355705738067627}
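The deleted metrics.json above is a JSON-lines file: one object per logged iteration, where the smoothed keys (eta_seconds, time) only appear from iteration 2 onward. A minimal reader, as a sketch (the path is taken from the diff header; nothing here is part of the patches):

    import json

    def load_metrics(path="projects/MagicPrompt/oneflow_magicprompt/metrics.json"):
        """Parse libai's JSON-lines metrics dump into a list of dicts."""
        with open(path) as f:
            return [json.loads(line) for line in f if line.strip()]

    for r in load_metrics():
        # "time" is absent for the first two iterations, hence the default.
        print(r["iteration"], r["total_loss"], r.get("time", float("nan")))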
From 64263fe5aa5e73939521357a9eeb2b9458051995 Mon Sep 17 00:00:00 2001
From: Houjiang Chen
Date: Thu, 4 May 2023 18:24:22 +0800
Subject: [PATCH 24/25] update mlu gpt2 dataset url (#502)

---
 projects/MagicPrompt/configs/gpt2_inference.py | 6 +++---
 projects/MagicPrompt/configs/gpt2_training.py  | 6 +++---
 projects/MagicPrompt/pipeline.py               | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/projects/MagicPrompt/configs/gpt2_inference.py b/projects/MagicPrompt/configs/gpt2_inference.py
index 11dea455a..565fcdc1e 100644
--- a/projects/MagicPrompt/configs/gpt2_inference.py
+++ b/projects/MagicPrompt/configs/gpt2_inference.py
@@ -58,13 +58,13 @@
     sep_token_id=None,
     decoder_start_token_id=None,
     # train
-    pretrained_model_path="/home/zhangxiaoyu/oneflow-model",
+    pretrained_model_path="./oneflow-model",
 )
 
 model = LazyCall(GPTModel)(cfg=cfg)
 pretrain_model = LazyCall(GPTForPreTraining)(cfg=cfg)
 tokenization.tokenizer = LazyCall(mock_tokenization.GPT2Tokenizer)(
-    vocab_file="/home/zhangxiaoyu/oneflow-model/vocab.json",
-    merges_file="/home/zhangxiaoyu/oneflow-model/merges.txt",
+    vocab_file="./oneflow-model/vocab.json",
+    merges_file="./oneflow-model/merges.txt",
 )
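With the absolute /home/... paths replaced by relative ones, the configs in this patch now resolve vocab.json, merges.txt, and the model directory against the current working directory. A small pre-flight check along these lines (a hypothetical helper, not part of the patch) fails fast when the assets have not been downloaded:

    from pathlib import Path

    # Assets referenced by the updated gpt2_inference.py; adjust if your
    # checkout keeps them elsewhere.
    required = [Path("./oneflow-model/vocab.json"), Path("./oneflow-model/merges.txt")]
    missing = [str(p) for p in required if not p.exists()]
    if missing:
        raise FileNotFoundError(f"MagicPrompt assets not found: {missing}")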
diff --git a/projects/MagicPrompt/configs/gpt2_training.py b/projects/MagicPrompt/configs/gpt2_training.py
index f715ee666..b2cecdb36 100644
--- a/projects/MagicPrompt/configs/gpt2_training.py
+++ b/projects/MagicPrompt/configs/gpt2_training.py
@@ -12,9 +12,9 @@
 graph.enabled=False
 graph.debug = 2
 
-vocab_file = "/home/zhangxiaoyu/magicprompt/vocab.json"
-merge_files = "/home/zhangxiaoyu/magicprompt/merges.txt"
-train_data_prefix = "/home/zhangxiaoyu/magicprompt/train/en_train_mmap_text_sentence"
+vocab_file = "./magicprompt/vocab.json"
+merge_files = "./magicprompt/merges.txt"
+train_data_prefix = "./magicprompt/train/en_train_mmap_text_sentence"
 
 tokenization.tokenizer.vocab_file = vocab_file
 tokenization.tokenizer.merges_file = merge_files

diff --git a/projects/MagicPrompt/pipeline.py b/projects/MagicPrompt/pipeline.py
index faabb3c36..919149be2 100644
--- a/projects/MagicPrompt/pipeline.py
+++ b/projects/MagicPrompt/pipeline.py
@@ -97,7 +97,7 @@ def postprocess(self, model_output_dict, **kwargs) -> dict:
         pipeline_parallel=1,
         # pipeline_stage_id=[0] * 6 + [1] * 6,
         # pipeline_num_layers=12,
-        model_path="/home/zhangxiaoyu/oneflow-model/model",
+        model_path="oneflow-model/model",
         mode="libai",
     )

From 2b800eece56d3c527913183d03a523043fe2998c Mon Sep 17 00:00:00 2001
From: Houjiang Chen
Date: Fri, 12 May 2023 18:41:57 +0800
Subject: [PATCH 25/25] import oneflow_mlu (#505)

---
 tools/train_net.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/train_net.py b/tools/train_net.py
index 1d6080631..a7c684910 100644
--- a/tools/train_net.py
+++ b/tools/train_net.py
@@ -20,6 +20,7 @@
 import numpy as np
 
 import oneflow as flow
+import oneflow_mlu
 
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
 
 from libai.config import LazyConfig, default_argument_parser, try_get_key
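The bare import in this last patch exists purely for its side effect: loading the oneflow_mlu extension is what makes the Cambricon MLU backend available to oneflow, so it has to run before any model or tensor is built (a linter will flag the import as unused; a `# noqa: F401` comment is the usual companion). A minimal smoke test, as a sketch that assumes the oneflow-cambricon build exposes the device string "mlu" and that an MLU card is present:

    import oneflow as flow
    import oneflow_mlu  # noqa: F401 -- side-effect import: enables the MLU backend

    # If the extension failed to load, placing a tensor on "mlu" would raise
    # instead of running on the Cambricon device.
    x = flow.ones(2, 3, device="mlu")
    print(x.device, x.sum().numpy())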